diff --git a/src/frame/packed_rgb_16bit.rs b/src/frame/packed_rgb_16bit.rs
index e06607c..492546e 100644
--- a/src/frame/packed_rgb_16bit.rs
+++ b/src/frame/packed_rgb_16bit.rs
@@ -4,13 +4,21 @@
 //! - `AV_PIX_FMT_RGBA64LE` → [`Rgba64Frame`] (R, G, B, A; stride in u16 elements ≥ 4 × width)
 //! - `AV_PIX_FMT_BGRA64LE` → [`Bgra64Frame`] (B, G, R, A; stride in u16 elements ≥ 4 × width)
 //!
-//! Stride is in **u16 elements** (not bytes). Plane slice is `&[u16]`.
-//! Callers with a raw FFmpeg byte buffer should cast via `bytemuck::cast_slice`
-//! (which checks alignment at runtime) and divide `linesize[0]` by 2. Direct
-//! pointer casts to `&[u16]` are undefined behaviour if the byte buffer is not
-//! 2-byte aligned, and produce wrong values on big-endian hosts — all FFmpeg
-//! `*LE` formats store samples little-endian, so big-endian targets would also
-//! need per-sample `u16::from_le` conversion.
+//! # Endian contract — **LE-encoded bytes**
+//!
+//! The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+//! `u16`, matching the `*LE` suffix in the FFmpeg pixel-format name.
+//! On a little-endian host (every CI runner today) LE bytes _are_ host-native,
+//! so `&[u16]` is also a host-native u16 slice; on a big-endian host the bytes
+//! must be byte-swapped to host-native order before arithmetic. Downstream
+//! row kernels handle this byte-swap (or no-op on LE) under the hood —
+//! callers do **not** pre-swap.
+//!
+//! Stride is in **u16 elements** (not bytes). Callers holding a raw FFmpeg
+//! byte buffer should cast via `bytemuck::cast_slice` (which checks alignment
+//! at runtime) and divide `linesize[0]` by 2 before constructing. Direct
+//! pointer casts to `&[u16]` are undefined behaviour if the byte buffer is
+//! not 2-byte aligned.
 
 use derive_more::IsVariant;
 use thiserror::Error;
@@ -62,10 +70,19 @@ pub enum Rgb48FrameError {
 }
 
 /// A validated packed **RGB48** frame (`AV_PIX_FMT_RGB48LE`) — three `u16`
-/// samples per pixel in `R, G, B` order. Each `u16` is a native little-endian
-/// sample; the caller is responsible for casting the raw FFmpeg byte buffer.
+/// samples per pixel in `R, G, B` order.
+///
+/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+/// `u16`, matching the `*LE` suffix in the FFmpeg pixel-format name.
+/// On a little-endian host (every CI runner today) LE bytes _are_ host-native,
+/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the
+/// bytes must be byte-swapped to host-native order before arithmetic.
+/// Downstream row kernels handle this byte-swap (or no-op on LE) under the
+/// hood — callers do **not** pre-swap.
 ///
-/// `stride` is in **u16 elements** (≥ `3 * width`).
+/// `stride` is in **u16 elements** (≥ `3 * width`). Callers holding byte
+/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide
+/// `linesize[0]` by 2 before constructing.
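+///
+/// For illustration, a hedged caller-side sketch — the `new` constructor
+/// name and the `data`/`linesize`/`width`/`height` bindings are assumptions
+/// for this example, not verified API:
+///
+/// ```ignore
+/// // `data: &[u8]` is the raw FFmpeg plane; `linesize` is its stride in bytes.
+/// let plane: &[u16] = bytemuck::cast_slice(data); // panics if misaligned
+/// let stride = linesize as usize / 2;             // bytes → u16 elements
+/// let frame = Rgb48Frame::new(plane, width, height, stride)?;
+/// ```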
 #[derive(Debug, Clone, Copy)]
 pub struct Rgb48Frame<'a> {
     rgb48: &'a [u16],
@@ -197,7 +214,17 @@ pub enum Bgr48FrameError {
 /// samples per pixel in `B, G, R` order. Channel order is reversed relative
 /// to [`Rgb48Frame`]; stride convention and element type are identical.
 ///
-/// `stride` is in **u16 elements** (≥ `3 * width`).
+/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+/// `u16`, matching the `*LE` suffix in the FFmpeg pixel-format name.
+/// On a little-endian host (every CI runner today) LE bytes _are_ host-native,
+/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the
+/// bytes must be byte-swapped to host-native order before arithmetic.
+/// Downstream row kernels handle this byte-swap (or no-op on LE) under the
+/// hood — callers do **not** pre-swap.
+///
+/// `stride` is in **u16 elements** (≥ `3 * width`). Callers holding byte
+/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide
+/// `linesize[0]` by 2 before constructing.
 #[derive(Debug, Clone, Copy)]
 pub struct Bgr48Frame<'a> {
     bgr48: &'a [u16],
@@ -329,7 +356,17 @@ pub enum Rgba64FrameError {
 /// samples per pixel in `R, G, B, A` order. The alpha channel is real
 /// (not padding) and is passed through by `with_rgba` / `with_rgba_u16`.
 ///
-/// `stride` is in **u16 elements** (≥ `4 * width`).
+/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+/// `u16`, matching the `*LE` suffix in the FFmpeg pixel-format name.
+/// On a little-endian host (every CI runner today) LE bytes _are_ host-native,
+/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the
+/// bytes must be byte-swapped to host-native order before arithmetic.
+/// Downstream row kernels handle this byte-swap (or no-op on LE) under the
+/// hood — callers do **not** pre-swap.
+///
+/// `stride` is in **u16 elements** (≥ `4 * width`). Callers holding byte
+/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide
+/// `linesize[0]` by 2 before constructing.
 #[derive(Debug, Clone, Copy)]
 pub struct Rgba64Frame<'a> {
     rgba64: &'a [u16],
@@ -462,7 +499,17 @@ pub enum Bgra64FrameError {
 /// first three elements relative to [`Rgba64Frame`]; alpha at position 3 is
 /// real and is passed through by `with_rgba` / `with_rgba_u16`.
 ///
-/// `stride` is in **u16 elements** (≥ `4 * width`).
+/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+/// `u16`, matching the `*LE` suffix in the FFmpeg pixel-format name.
+/// On a little-endian host (every CI runner today) LE bytes _are_ host-native,
+/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the
+/// bytes must be byte-swapped to host-native order before arithmetic.
+/// Downstream row kernels handle this byte-swap (or no-op on LE) under the
+/// hood — callers do **not** pre-swap.
+///
+/// `stride` is in **u16 elements** (≥ `4 * width`). Callers holding byte
+/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide
+/// `linesize[0]` by 2 before constructing.
 #[derive(Debug, Clone, Copy)]
 pub struct Bgra64Frame<'a> {
     bgra64: &'a [u16],
diff --git a/src/row/arch/neon/packed_rgb.rs b/src/row/arch/neon/packed_rgb.rs
index ccf26eb..1795438 100644
--- a/src/row/arch/neon/packed_rgb.rs
+++ b/src/row/arch/neon/packed_rgb.rs
@@ -517,55 +517,61 @@ unsafe fn x2_extract_10bit_u16_lane(pix: uint32x4_t, shift: i32) -> uint16x4_t {
 /// 3. `x2rgb10` / `rgb_out` must not alias.
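+///
+/// Illustrative monomorphized dispatch — a hedged sketch; the pairing of a
+/// big-endian wire source with `BE = true` is an assumption for this example:
+///
+/// ```ignore
+/// // AV_PIX_FMT_X2RGB10LE → BE = false; a BE-encoded source → BE = true.
+/// unsafe { x2rgb10_to_rgb_row::<false>(src_le, dst, width) };
+/// unsafe { x2rgb10_to_rgb_row::<true>(src_be, dst, width) };
+/// ```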
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgb_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
-            let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32));
-            let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48));
-
-            // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit).
-            let r_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 22),
-                x2_extract_10bit_u8_lane(p1, 22),
-            );
-            let r_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 22),
-                x2_extract_10bit_u8_lane(p3, 22),
-            );
-            let g_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 12),
-                x2_extract_10bit_u8_lane(p1, 12),
-            );
-            let g_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 12),
-                x2_extract_10bit_u8_lane(p3, 12),
-            );
-            let b_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 2),
-                x2_extract_10bit_u8_lane(p1, 2),
-            );
-            let b_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 2),
-                x2_extract_10bit_u8_lane(p3, 2),
-            );
-
-            let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
-            let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
-            let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
-
-            let rgb = uint8x16x3_t(r, g, b);
-            vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
-
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
+                let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32));
+                let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48));
+
+                // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit).
+                let r_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 22),
+                    x2_extract_10bit_u8_lane(p1, 22),
+                );
+                let r_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 22),
+                    x2_extract_10bit_u8_lane(p3, 22),
+                );
+                let g_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 12),
+                    x2_extract_10bit_u8_lane(p1, 12),
+                );
+                let g_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 12),
+                    x2_extract_10bit_u8_lane(p3, 12),
+                );
+                let b_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 2),
+                    x2_extract_10bit_u8_lane(p1, 2),
+                );
+                let b_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 2),
+                    x2_extract_10bit_u8_lane(p3, 2),
+                );
+
+                let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
+                let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
+                let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
+
+                let rgb = uint8x16x3_t(r, g, b);
+                vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
+
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgb_row(
+            scalar::x2rgb10_to_rgb_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -584,55 +590,61 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt
 /// 3. `x2rgb10` / `rgba_out` must not alias.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgba_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
     unsafe {
         let alpha = vdupq_n_u8(0xFF);
         let mut x = 0usize;
-        while x + 16 <= width {
-            let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
-            let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32));
-            let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48));
-
-            let r_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 22),
-                x2_extract_10bit_u8_lane(p1, 22),
-            );
-            let r_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 22),
-                x2_extract_10bit_u8_lane(p3, 22),
-            );
-            let g_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 12),
-                x2_extract_10bit_u8_lane(p1, 12),
-            );
-            let g_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 12),
-                x2_extract_10bit_u8_lane(p3, 12),
-            );
-            let b_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 2),
-                x2_extract_10bit_u8_lane(p1, 2),
-            );
-            let b_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 2),
-                x2_extract_10bit_u8_lane(p3, 2),
-            );
-
-            let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
-            let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
-            let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
-
-            let rgba = uint8x16x4_t(r, g, b, alpha);
-            vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba);
-
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
+                let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32));
+                let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48));
+
+                let r_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 22),
+                    x2_extract_10bit_u8_lane(p1, 22),
+                );
+                let r_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 22),
+                    x2_extract_10bit_u8_lane(p3, 22),
+                );
+                let g_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 12),
+                    x2_extract_10bit_u8_lane(p1, 12),
+                );
+                let g_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 12),
+                    x2_extract_10bit_u8_lane(p3, 12),
+                );
+                let b_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 2),
+                    x2_extract_10bit_u8_lane(p1, 2),
+                );
+                let b_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 2),
+                    x2_extract_10bit_u8_lane(p3, 2),
+                );
+
+                let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
+                let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
+                let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
+
+                let rgba = uint8x16x4_t(r, g, b, alpha);
+                vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba);
+
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgba_row(
+            scalar::x2rgb10_to_rgba_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgba_out[x * 4..width * 4],
                 width - x,
@@ -651,37 +663,43 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi
 /// 3. `x2rgb10` / `rgb_out` must not alias.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgb_u16_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     unsafe {
         let mut x = 0usize;
-        while x + 8 <= width {
-            let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
-
-            // Channel low bit positions: R at 20, G at 10, B at 0.
-            let r = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 20),
-                x2_extract_10bit_u16_lane(p1, 20),
-            );
-            let g = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 10),
-                x2_extract_10bit_u16_lane(p1, 10),
-            );
-            let b = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 0),
-                x2_extract_10bit_u16_lane(p1, 0),
-            );
-
-            let rgb = uint16x8x3_t(r, g, b);
-            vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb);
-
-            x += 8;
+        if !BE {
+            while x + 8 <= width {
+                let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16));
+
+                // Channel low bit positions: R at 20, G at 10, B at 0.
+                let r = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 20),
+                    x2_extract_10bit_u16_lane(p1, 20),
+                );
+                let g = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 10),
+                    x2_extract_10bit_u16_lane(p1, 10),
+                );
+                let b = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 0),
+                    x2_extract_10bit_u16_lane(p1, 0),
+                );
+
+                let rgb = uint16x8x3_t(r, g, b);
+                vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb);
+
+                x += 8;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgb_u16_row(
+            scalar::x2rgb10_to_rgb_u16_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -695,54 +713,60 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16],
 /// B at >>22).
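+///
+/// Hedged scalar model of the top-8-of-10 extraction (`px` is one LE-decoded
+/// `u32`; shift values follow the X2BGR10 layout documented above):
+///
+/// ```ignore
+/// let r8 = ((px >> 2) & 0xFF) as u8;  // R in bits 0..=9  → top 8 at >>2
+/// let g8 = ((px >> 12) & 0xFF) as u8; // G in bits 10..=19 → top 8 at >>12
+/// let b8 = ((px >> 22) & 0xFF) as u8; // B in bits 20..=29 → top 8 at >>22
+/// ```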
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgb_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
-            let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32));
-            let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48));
-
-            let r_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 2),
-                x2_extract_10bit_u8_lane(p1, 2),
-            );
-            let r_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 2),
-                x2_extract_10bit_u8_lane(p3, 2),
-            );
-            let g_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 12),
-                x2_extract_10bit_u8_lane(p1, 12),
-            );
-            let g_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 12),
-                x2_extract_10bit_u8_lane(p3, 12),
-            );
-            let b_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 22),
-                x2_extract_10bit_u8_lane(p1, 22),
-            );
-            let b_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 22),
-                x2_extract_10bit_u8_lane(p3, 22),
-            );
-
-            let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
-            let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
-            let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
-
-            let rgb = uint8x16x3_t(r, g, b);
-            vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
-
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
+                let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32));
+                let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48));
+
+                let r_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 2),
+                    x2_extract_10bit_u8_lane(p1, 2),
+                );
+                let r_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 2),
+                    x2_extract_10bit_u8_lane(p3, 2),
+                );
+                let g_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 12),
+                    x2_extract_10bit_u8_lane(p1, 12),
+                );
+                let g_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 12),
+                    x2_extract_10bit_u8_lane(p3, 12),
+                );
+                let b_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 22),
+                    x2_extract_10bit_u8_lane(p1, 22),
+                );
+                let b_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 22),
+                    x2_extract_10bit_u8_lane(p3, 22),
+                );
+
+                let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
+                let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
+                let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
+
+                let rgb = uint8x16x3_t(r, g, b);
+                vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb);
+
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgb_row(
+            scalar::x2bgr10_to_rgb_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -754,55 +778,61 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt
 /// NEON X2BGR10→RGBA. 16 pixels per iteration.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
    debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
    debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
     unsafe {
         let alpha = vdupq_n_u8(0xFF);
         let mut x = 0usize;
-        while x + 16 <= width {
-            let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
-            let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32));
-            let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48));
-
-            let r_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 2),
-                x2_extract_10bit_u8_lane(p1, 2),
-            );
-            let r_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 2),
-                x2_extract_10bit_u8_lane(p3, 2),
-            );
-            let g_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 12),
-                x2_extract_10bit_u8_lane(p1, 12),
-            );
-            let g_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 12),
-                x2_extract_10bit_u8_lane(p3, 12),
-            );
-            let b_lo = vcombine_u16(
-                x2_extract_10bit_u8_lane(p0, 22),
-                x2_extract_10bit_u8_lane(p1, 22),
-            );
-            let b_hi = vcombine_u16(
-                x2_extract_10bit_u8_lane(p2, 22),
-                x2_extract_10bit_u8_lane(p3, 22),
-            );
-
-            let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
-            let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
-            let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
-
-            let rgba = uint8x16x4_t(r, g, b, alpha);
-            vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba);
-
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
+                let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32));
+                let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48));
+
+                let r_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 2),
+                    x2_extract_10bit_u8_lane(p1, 2),
+                );
+                let r_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 2),
+                    x2_extract_10bit_u8_lane(p3, 2),
+                );
+                let g_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 12),
+                    x2_extract_10bit_u8_lane(p1, 12),
+                );
+                let g_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 12),
+                    x2_extract_10bit_u8_lane(p3, 12),
+                );
+                let b_lo = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p0, 22),
+                    x2_extract_10bit_u8_lane(p1, 22),
+                );
+                let b_hi = vcombine_u16(
+                    x2_extract_10bit_u8_lane(p2, 22),
+                    x2_extract_10bit_u8_lane(p3, 22),
+                );
+
+                let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi));
+                let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi));
+                let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi));
+
+                let rgba = uint8x16x4_t(r, g, b, alpha);
+                vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba);
+
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgba_row(
+            scalar::x2bgr10_to_rgba_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgba_out[x * 4..width * 4],
                 width - x,
@@ -814,37 +844,43 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi
 /// NEON X2BGR10→u16 RGB native. 8 pixels per iteration.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     unsafe {
         let mut x = 0usize;
-        while x + 8 <= width {
-            let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
-            let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
-
-            // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29.
-            let r = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 0),
-                x2_extract_10bit_u16_lane(p1, 0),
-            );
-            let g = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 10),
-                x2_extract_10bit_u16_lane(p1, 10),
-            );
-            let b = vcombine_u16(
-                x2_extract_10bit_u16_lane(p0, 20),
-                x2_extract_10bit_u16_lane(p1, 20),
-            );
-
-            let rgb = uint16x8x3_t(r, g, b);
-            vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb);
-
-            x += 8;
+        if !BE {
+            while x + 8 <= width {
+                let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4));
+                let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16));
+
+                // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29.
+                let r = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 0),
+                    x2_extract_10bit_u16_lane(p1, 0),
+                );
+                let g = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 10),
+                    x2_extract_10bit_u16_lane(p1, 10),
+                );
+                let b = vcombine_u16(
+                    x2_extract_10bit_u16_lane(p0, 20),
+                    x2_extract_10bit_u16_lane(p1, 20),
+                );
+
+                let rgb = uint16x8x3_t(r, g, b);
+                vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb);
+
+                x += 8;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgb_u16_row(
+            scalar::x2bgr10_to_rgb_u16_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
diff --git a/src/row/arch/neon/packed_rgb_16bit.rs b/src/row/arch/neon/packed_rgb_16bit.rs
index 3370c8a..caf0c5f 100644
--- a/src/row/arch/neon/packed_rgb_16bit.rs
+++ b/src/row/arch/neon/packed_rgb_16bit.rs
@@ -21,6 +21,18 @@
 //! - **Rgba64 / Bgra64:** `vld4q_u16(src_ptr)` → `uint16x8x4_t(ch0, ch1, ch2, ch3)`.
 //!   For Bgra64, `ch0` = B and `ch2` = R (swapped on store).
 //!
+//! ## Big-endian support
+//!
+//! Every public kernel accepts `<const BE: bool>`. Each per-channel
+//! `uint16x8_t` vector produced by `vld3q_u16`/`vld4q_u16` is conditionally
+//! byte-swapped via the canonical [`super::bswap_u16x8_if_be`] helper before
+//! any channel math. The gate is `BE != HOST_NATIVE_BE`, so the swap fires
+//! only when the wire endian differs from the host's native byte order — on
+//! LE hosts (every mainstream AArch64 target) reading LE data the helper is
+//! a no-op and emits zero extra instructions; on BE hosts (e.g. `aarch64_be`)
+//! reading LE data the swap fires to recover host-native u16 lanes for the
+//! arithmetic that follows.
+//!
 //! ## Depth conversion
 //!
 //! - **u16 → u8:** `vshrn_n_u16::<8>(v)` — high-byte extraction, matching
@@ -33,6 +45,7 @@
 use core::arch::aarch64::*;
 
+use super::bswap_u16x8_if_be;
 use crate::row::scalar;
 
 // =============================================================================
 
@@ -44,6 +57,8 @@ use crate::row::scalar;
 /// `vld3q_u16` deinterleaves into `(R, G, B)` u16x8; `vshrn_n_u16::<8>`
 /// narrows each channel; `vst3_u8` interleaves back.
 ///
+/// When `BE = true` each channel vector is byte-swapped before narrowing.
+///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgb48.len() >= width * 3`.
@@ -51,7 +66,11 @@ use crate::row::scalar;
 /// 3. `rgb_out.len() >= width * 3`.
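+///
+/// Hedged per-lane scalar model of the swap-then-narrow (not the actual NEON
+/// lowering; `raw` is one u16 lane as loaded natively from memory):
+///
+/// ```ignore
+/// let native = if BE { u16::from_be(raw) } else { u16::from_le(raw) };
+/// let out8 = (native >> 8) as u8; // high-byte narrowing
+/// ```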
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_rgb48_to_rgb_row<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -59,20 +78,22 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi
         let mut x = 0usize;
         while x + 8 <= width {
             let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3));
-            let r8 = vshrn_n_u16::<8>(px.0);
-            let g8 = vshrn_n_u16::<8>(px.1);
-            let b8 = vshrn_n_u16::<8>(px.2);
+            let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
+            let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+            let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
             vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8));
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
+            scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// NEON Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF.
 ///
+/// When `BE = true` each channel vector is byte-swapped before narrowing.
+///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgb48.len() >= width * 3`.
@@ -80,7 +101,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_rgb48_to_rgba_row<const BE: bool>(
+    rgb48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -89,9 +114,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3));
-            let r8 = vshrn_n_u16::<8>(px.0);
-            let g8 = vshrn_n_u16::<8>(px.1);
-            let b8 = vshrn_n_u16::<8>(px.2);
+            let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
+            let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+            let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
             vst4_u8(
                 rgba_out.as_mut_ptr().add(x * 4),
                 uint8x8x4_t(r8, g8, b8, alpha),
@@ -99,14 +124,15 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
-/// NEON Rgb48 → native-depth u16 RGB (identity copy). 8 pixels per SIMD iteration.
+/// NEON Rgb48 → native-depth u16 RGB. 8 pixels per SIMD iteration.
 ///
-/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves — no narrowing.
+/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves.
+/// When `BE = true` each channel is byte-swapped to host-native order before storing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgb48.len() >= width * 3`.
@@ -115,7 +141,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -125,18 +155,24 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16
         let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3));
         vst3q_u16(
             rgb_out.as_mut_ptr().add(x * 3),
-            uint16x8x3_t(px.0, px.1, px.2),
+            uint16x8x3_t(
+                bswap_u16x8_if_be::<BE>(px.0),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.2),
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
+        scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
 
 /// NEON Rgb48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFFFF.
 ///
+/// When `BE = true` each channel is byte-swapped to host-native order before storing.
+///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgb48.len() >= width * 3`.
@@ -144,7 +180,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row<const BE: bool>(
+    rgb48: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -155,12 +195,17 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u
         let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3));
         vst4q_u16(
             rgba_out.as_mut_ptr().add(x * 4),
-            uint16x8x4_t(px.0, px.1, px.2, alpha),
+            uint16x8x4_t(
+                bswap_u16x8_if_be::<BE>(px.0),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.2),
+                alpha,
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
+        scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
    }
 }
 }
@@ -173,6 +218,7 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u
 ///
 /// `vld3q_u16` deinterleaves into `(B, G, R)` u16x8; channels are swapped
 /// (`px.2` = R, `px.0` = B) in the `vst3_u8` call to produce R-first output.
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgr48.len() >= width * 3`.
@@ -181,7 +227,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u
 /// 3. `rgb_out.len() >= width * 3`.
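+///
+/// Hedged per-pixel scalar model of the reorder + narrow (not the NEON
+/// lowering; `src`/`dst` are illustrative 3-element slices):
+///
+/// ```ignore
+/// let (b, g, r) = (src[0], src[1], src[2]); // source order is B, G, R
+/// dst[0] = (r >> 8) as u8;                  // R first on output
+/// dst[1] = (g >> 8) as u8;
+/// dst[2] = (b >> 8) as u8;
+/// ```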
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_bgr48_to_rgb_row<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -190,20 +240,21 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi
     while x + 8 <= width {
         // px.0 = B, px.1 = G, px.2 = R (source BGR order)
         let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3));
-        let r8 = vshrn_n_u16::<8>(px.2); // R (was at position 2)
-        let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged)
-        let b8 = vshrn_n_u16::<8>(px.0); // B (was at position 0)
+        let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2)); // R (was at position 2)
+        let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1)); // G (unchanged)
+        let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0)); // B (was at position 0)
         vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8));
         x += 8;
     }
     if x < width {
-        scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
+        scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
 
 /// NEON Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap on output; alpha forced to 0xFF.
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgr48.len() >= width * 3`.
@@ -212,7 +263,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_bgr48_to_rgba_row<const BE: bool>(
+    bgr48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -221,9 +276,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3));
-            let r8 = vshrn_n_u16::<8>(px.2);
-            let g8 = vshrn_n_u16::<8>(px.1);
-            let b8 = vshrn_n_u16::<8>(px.0);
+            let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
+            let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+            let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
             vst4_u8(
                 rgba_out.as_mut_ptr().add(x * 4),
                 uint8x8x4_t(r8, g8, b8, alpha),
@@ -231,13 +286,14 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
 /// NEON Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration.
 /// B↔R swap: `px.2` → position 0 (R), `px.0` → position 2 (B).
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgr48.len() >= width * 3`.
@@ -246,7 +302,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -257,18 +317,23 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16
         // Swap B↔R: store (R=px.2, G=px.1, B=px.0)
         vst3q_u16(
             rgb_out.as_mut_ptr().add(x * 3),
-            uint16x8x3_t(px.2, px.1, px.0),
+            uint16x8x3_t(
+                bswap_u16x8_if_be::<BE>(px.2),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.0),
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
+        scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
    }
 }
 }
 
 /// NEON Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; alpha forced to 0xFFFF.
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgr48.len() >= width * 3`.
@@ -277,7 +342,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row<const BE: bool>(
+    bgr48: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -289,12 +358,17 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u
         // Store (R=px.2, G=px.1, B=px.0, A=0xFFFF)
         vst4q_u16(
             rgba_out.as_mut_ptr().add(x * 4),
-            uint16x8x4_t(px.2, px.1, px.0, alpha),
+            uint16x8x4_t(
+                bswap_u16x8_if_be::<BE>(px.2),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.0),
+                alpha,
+            ),
        );
        x += 8;
     }
     if x < width {
-        scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
+        scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
     }
 }
 }
@@ -307,6 +381,7 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u
 ///
 /// `vld4q_u16` deinterleaves into `(R, G, B, A)` u16x8; R/G/B narrowed;
 /// `vst3_u8` writes only 3 channels.
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgba64.len() >= width * 4`.
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_rgba64_to_rgb_row<const BE: bool>(
+    rgba64: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -323,15 +402,15 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
     let mut x = 0usize;
     while x + 8 <= width {
         let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4));
-        let r8 = vshrn_n_u16::<8>(px.0);
-        let g8 = vshrn_n_u16::<8>(px.1);
-        let b8 = vshrn_n_u16::<8>(px.2);
+        let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
+        let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+        let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
         // Alpha (px.3) discarded.
         vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8));
         x += 8;
     }
     if x < width {
-        scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
+        scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
@@ -339,6 +418,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
 /// NEON Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through.
 ///
 /// All 4 channels narrowed via `vshrn_n_u16::<8>`.
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgba64.len() >= width * 4`.
@@ -347,7 +427,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_rgba64_to_rgba_row<const BE: bool>(
+    rgba64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -355,10 +439,10 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8]
     let mut x = 0usize;
     while x + 8 <= width {
         let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4));
-        let r8 = vshrn_n_u16::<8>(px.0);
-        let g8 = vshrn_n_u16::<8>(px.1);
-        let b8 = vshrn_n_u16::<8>(px.2);
-        let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted
+        let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
+        let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+        let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
+        let a8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.3)); // source alpha depth-converted
         vst4_u8(
             rgba_out.as_mut_ptr().add(x * 4),
             uint8x8x4_t(r8, g8, b8, a8),
@@ -366,7 +450,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8]
         x += 8;
     }
     if x < width {
-        scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
+        scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
     }
 }
 }
@@ -374,6 +458,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8]
 /// NEON Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded.
 ///
 /// `vld4q_u16` deinterleaves; `vst3q_u16` writes R, G, B channels only.
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgba64.len() >= width * 4`.
@@ -382,7 +467,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8]
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row<const BE: bool>(
+    rgba64: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -393,19 +482,24 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u
         // Alpha (px.3) discarded.
         vst3q_u16(
             rgb_out.as_mut_ptr().add(x * 3),
-            uint16x8x3_t(px.0, px.1, px.2),
+            uint16x8x3_t(
+                bswap_u16x8_if_be::<BE>(px.0),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.2),
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
+        scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
 
-/// NEON Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per SIMD iteration.
+/// NEON Rgba64 → native-depth u16 RGBA. 8 pixels per SIMD iteration.
 ///
 /// `vld4q_u16` deinterleaves; `vst4q_u16` reinterleaves — source alpha preserved.
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `rgba64.len() >= width * 4`.
@@ -414,7 +508,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row(
+pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row<const BE: bool>(
     rgba64: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -428,12 +522,17 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row(
         let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4));
         vst4q_u16(
             rgba_out.as_mut_ptr().add(x * 4),
-            uint16x8x4_t(px.0, px.1, px.2, px.3),
+            uint16x8x4_t(
+                bswap_u16x8_if_be::<BE>(px.0),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.2),
+                bswap_u16x8_if_be::<BE>(px.3),
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
+        scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
     }
 }
 }
@@ -446,6 +545,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row(
 /// B↔R swap; alpha discarded.
 ///
 /// `vld4q_u16` gives `(B, G, R, A)` → store `(R=px.2, G=px.1, B=px.0)`.
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgra64.len() >= width * 4`.
@@ -454,7 +554,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row(
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_bgra64_to_rgb_row<const BE: bool>(
+    bgra64: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -463,21 +567,22 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8],
     while x + 8 <= width {
         // px.0 = B, px.1 = G, px.2 = R, px.3 = A
         let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4));
-        let r8 = vshrn_n_u16::<8>(px.2); // R (from position 2)
-        let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged)
-        let b8 = vshrn_n_u16::<8>(px.0); // B (from position 0)
+        let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2)); // R (from position 2)
+        let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1)); // G (unchanged)
+        let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0)); // B (from position 0)
         // Alpha (px.3) discarded.
         vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8));
         x += 8;
     }
     if x < width {
-        scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
+        scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
 
 /// NEON Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; source alpha passes through (narrowed via `>> 8`).
+/// When `BE = true` each channel is byte-swapped before narrowing.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgra64.len() >= width * 4`.
@@ -486,7 +591,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8],
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn neon_bgra64_to_rgba_row<const BE: bool>(
+    bgra64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -494,10 +603,10 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8]
     let mut x = 0usize;
     while x + 8 <= width {
         let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4));
-        let r8 = vshrn_n_u16::<8>(px.2);
-        let g8 = vshrn_n_u16::<8>(px.1);
-        let b8 = vshrn_n_u16::<8>(px.0);
-        let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted
+        let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.2));
+        let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.1));
+        let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.0));
+        let a8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::<BE>(px.3)); // source alpha depth-converted
         vst4_u8(
             rgba_out.as_mut_ptr().add(x * 4),
             uint8x8x4_t(r8, g8, b8, a8),
@@ -505,13 +614,14 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8]
         x += 8;
     }
     if x < width {
-        scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
+        scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
     }
 }
 }
 
 /// NEON Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration.
 /// B↔R swap; alpha discarded. `vld4q_u16` → `vst3q_u16(R, G, B)`.
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgra64.len() >= width * 4`.
@@ -520,7 +630,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8]
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "neon")]
-pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row<const BE: bool>(
+    bgra64: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -531,12 +645,16 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u
         // Swap B↔R, drop alpha: store (R=px.2, G=px.1, B=px.0)
         vst3q_u16(
             rgb_out.as_mut_ptr().add(x * 3),
-            uint16x8x3_t(px.2, px.1, px.0),
+            uint16x8x3_t(
+                bswap_u16x8_if_be::<BE>(px.2),
+                bswap_u16x8_if_be::<BE>(px.1),
+                bswap_u16x8_if_be::<BE>(px.0),
+            ),
         );
         x += 8;
     }
     if x < width {
-        scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
+        scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
     }
 }
 }
@@ -545,6 +663,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u
 /// B↔R swap; source alpha preserved at position 3.
 ///
 /// `vld4q_u16` gives `(B, G, R, A)` → `vst4q_u16(R=px.2, G=px.1, B=px.0, A=px.3)`.
+/// When `BE = true` each channel is byte-swapped to host-native order.
 ///
 /// # Safety
 ///
 /// 1. NEON must be available.
 /// 2. `bgra64.len() >= width * 4`.
@@ -553,7 +672,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u
 /// 3. `rgba_out.len() >= width * 4`.
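+///
+/// Hedged per-pixel model of the store (after the endian fix-up; `px`/`out`
+/// are illustrative 4-element arrays):
+///
+/// ```ignore
+/// let [b, g, r, a] = [px[0], px[1], px[2], px[3]]; // source is B, G, R, A
+/// out.copy_from_slice(&[r, g, b, a]);              // R-first, alpha kept
+/// ```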
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -568,12 +687,17 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( // Swap B↔R, preserve A: store (R=px.2, G=px.1, B=px.0, A=px.3) vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.2, px.1, px.0, px.3), + uint16x8x4_t( + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.3), + ), ); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/neon/tests/packed_rgb.rs b/src/row/arch/neon/tests/packed_rgb.rs index 7e5ace2..39e15ea 100644 --- a/src/row/arch/neon/tests/packed_rgb.rs +++ b/src/row/arch/neon/tests/packed_rgb.rs @@ -261,9 +261,9 @@ fn x2rgb10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -276,9 +276,9 @@ fn x2rgb10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_neon, w); + x2rgb10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -291,9 +291,9 @@ fn x2rgb10_to_rgb_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -306,9 +306,9 @@ fn x2bgr10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -321,9 +321,9 @@ fn x2bgr10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_neon, w); + x2bgr10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -336,10 +336,97 @@ fn x2bgr10_to_rgb_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - 
scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } } + +// ---- SIMD-level BE-vs-LE parity for X2RGB10 / X2BGR10 ------------------- +// +// The X2 SIMD bodies are LE-only (`if !BE` gate falls through to scalar for +// BE), but the parity test is still meaningful: `` exercises the SIMD +// body on LE bytes; `` exercises the scalar reference on BE bytes +// (which is where the host-independence of the byte-buffer construction +// matters). Both must produce identical output. Width 33 ensures the SIMD +// body executes (NEON does 16 px / iter). + +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_x2rgb10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 33); + x2rgb10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_x2bgr10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 33); + x2bgr10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/neon/tests/packed_rgb_16bit.rs b/src/row/arch/neon/tests/packed_rgb_16bit.rs index ad38e7b..bee374c 100644 --- a/src/row/arch/neon/tests/packed_rgb_16bit.rs +++ 
b/src/row/arch/neon/tests/packed_rgb_16bit.rs @@ -33,8 +33,8 @@ fn neon_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgb: SIMD vs scalar mismatch"); } @@ -45,8 +45,8 @@ fn neon_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgba: SIMD vs scalar mismatch"); } @@ -57,8 +57,8 @@ fn neon_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16: SIMD vs scalar mismatch" @@ -72,8 +72,8 @@ fn neon_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16: SIMD vs scalar mismatch" @@ -91,8 +91,8 @@ fn neon_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgb: SIMD vs scalar mismatch"); } @@ -103,8 +103,8 @@ fn neon_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgba: SIMD vs scalar mismatch"); } @@ -115,8 +115,8 @@ fn neon_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, 
&mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16: SIMD vs scalar mismatch" @@ -130,8 +130,8 @@ fn neon_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16: SIMD vs scalar mismatch" @@ -149,8 +149,8 @@ fn neon_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgb: SIMD vs scalar mismatch"); } @@ -161,8 +161,8 @@ fn neon_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgba: SIMD vs scalar mismatch"); } @@ -173,8 +173,8 @@ fn neon_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16: SIMD vs scalar mismatch" @@ -188,8 +188,8 @@ fn neon_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16: SIMD vs scalar mismatch" @@ -207,8 +207,8 @@ fn neon_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgb: SIMD vs scalar mismatch"); } @@ -219,8 +219,8 @@ fn neon_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out,
17); + unsafe { neon_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgba: SIMD vs scalar mismatch"); } @@ -231,8 +231,8 @@ fn neon_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16: SIMD vs scalar mismatch" @@ -246,8 +246,8 @@ fn neon_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16: SIMD vs scalar mismatch" @@ -265,8 +265,8 @@ fn neon_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { neon_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -280,8 +280,8 @@ fn neon_rgba64_to_rgba_exact8_matches_scalar() { let src = make_rgba64_src(8, 0x0F0F); let mut simd_out = std::vec![0u8; 8 * 4]; let mut scalar_out = std::vec![0u8; 8 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 8) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 8); + unsafe { neon_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 8) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgba64→rgba exact-8: SIMD vs scalar mismatch" @@ -299,8 +299,8 @@ fn neon_rgb48_to_rgb_width1_scalar_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { neon_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -314,10 +314,234 @@ fn neon_bgra64_to_rgba_u16_width1_scalar_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { neon_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" ); } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// These probe the
`bswap_u16x8_if_be` gate (`BE != HOST_NATIVE_BE`) at +the SIMD layer. Existing tests above use `BE=false` only and never exercise the swap path. The fix in this commit replaces the broken +`if BE { ... }` gate (which corrupted output on the BE host × LE wire and +BE host × BE wire quadrants) with the canonical `bswap_u16x8_if_be` helper from `super::`. +// +// Buffers are constructed via `to_le_bytes` / `to_be_bytes` so semantics are +// host-independent: on every host, `le_buf` carries the intended values as +// LE-encoded bytes and `be_buf` carries the same values as BE-encoded bytes. +// Both `kernel::<false>(le_buf)` and `kernel::<true>(be_buf)` should +// therefore decode to the same intended host-native u16 values and produce +// identical RGB output. Mirrors PR #86's `87d682f` / `6924907` patterns. +// +// Width 17 = 2 × 8-lane SIMD body + 1 scalar tail, ensuring the SIMD body +// is exercised (not just the scalar tail). + +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) { + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec<u16> = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec<u16> = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_rgb48_be_le_simd_parity_width17() { + let intended = make_rgb48_src(17, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_rgb48_to_rgb_row::<false>(&le, &mut out_le, 17); + neon_rgb48_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_rgb48_to_rgba_row::<false>(&le, &mut out_le, 17); + neon_rgb48_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + neon_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + neon_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_bgr48_be_le_simd_parity_width17() { + let intended = make_rgb48_src(17, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_bgr48_to_rgb_row::<false>(&le, &mut out_le, 17); + neon_bgr48_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_bgr48_to_rgba_row::<false>(&le, &mut out_le, 17); + neon_bgr48_to_rgba_row::<true>(&be, &mut out_be,
17); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_bgr48_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + neon_bgr48_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_bgr48_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + neon_bgr48_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_rgba64_be_le_simd_parity_width17() { + let intended = make_rgba64_src(17, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_rgba64_to_rgb_row::<false>(&le, &mut out_le, 17); + neon_rgba64_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_rgba64_to_rgba_row::<false>(&le, &mut out_le, 17); + neon_rgba64_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_rgba64_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + neon_rgba64_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_rgba64_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + neon_rgba64_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_bgra64_be_le_simd_parity_width17() { + let intended = make_rgba64_src(17, 0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_bgra64_to_rgb_row::<false>(&le, &mut out_le, 17); + neon_bgra64_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_bgra64_to_rgba_row::<false>(&le, &mut out_le, 17); + neon_bgra64_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_bgra64_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + neon_bgra64_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_bgra64_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + neon_bgra64_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +}
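// ---------------------------------------------------------------------------
// Reviewer aside (illustration only, not part of the patch): a minimal scalar
// sketch of the `BE != HOST_NATIVE_BE` gate the parity tests above exercise.
// `decode_sample` is a hypothetical stand-in for the kernels' per-lane decode,
// not a function in this crate; `HOST_NATIVE_BE` mirrors the constant this
// patch introduces in packed_rgb_16bit.rs below. For any intended value `v`,
// decoding the LE reinterpretation with `BE = false` and the BE
// reinterpretation with `BE = true` must agree on every host — the same
// equality the `*_be_le_simd_parity_*` tests assert eight lanes at a time.

const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

/// Decode one raw u16 (wire bytes reinterpreted in host order) to host-native.
fn decode_sample<const BE: bool>(raw: u16) -> u16 {
    // Swap only when the wire byte order differs from the host byte order.
    if BE != HOST_NATIVE_BE { raw.swap_bytes() } else { raw }
}

fn main() {
    let v: u16 = 0x1234;
    // Reinterpret the wire bytes in host order, as the kernels' loads do.
    let le_raw = u16::from_ne_bytes(v.to_le_bytes());
    let be_raw = u16::from_ne_bytes(v.to_be_bytes());
    assert_eq!(decode_sample::<false>(le_raw), v); // LE wire, any host
    assert_eq!(decode_sample::<true>(be_raw), v); // BE wire, any host
}
// ---------------------------------------------------------------------------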
diff --git a/src/row/arch/wasm_simd128/packed_rgb.rs b/src/row/arch/wasm_simd128/packed_rgb.rs index 49d1edb..53644ab 100644 --- a/src/row/arch/wasm_simd128/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/packed_rgb.rs @@ -623,99 +623,105 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row<const BE: bool>( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); - - // Extract 10-bit channels as u32x4 (low 10 bits set per lane). - // X2RGB10: R at >>20, G at >>10, B at >>0. - let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); - let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); - let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - let b2 = v128_and(p2, mask_3ff); - let b3 = v128_and(p3, mask_3ff); - - // Down-shift 10-bit → 8-bit. - let r0_u8 = u32x4_shr(r0, 2); - let r1_u8 = u32x4_shr(r1, 2); - let r2_u8 = u32x4_shr(r2, 2); - let r3_u8 = u32x4_shr(r3, 2); - let g0_u8 = u32x4_shr(g0, 2); - let g1_u8 = u32x4_shr(g1, 2); - let g2_u8 = u32x4_shr(g2, 2); - let g3_u8 = u32x4_shr(g3, 2); - let b0_u8 = u32x4_shr(b0, 2); - let b1_u8 = u32x4_shr(b1, 2); - let b2_u8 = u32x4_shr(b2, 2); - let b3_u8 = u32x4_shr(b3, 2); - - // u32x4 → u16x8 (saturating narrow). - let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); - let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); - let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); - let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); - let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); - let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); - - // u16x8 → u8x16. - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - // Interleave (R, G, B) into 48 packed bytes via the same - // 9-shuffle pattern used by the YUV→RGB kernels.
- let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); + + // Extract 10-bit channels as u32x4 (low 10 bits set per lane). + // X2RGB10: R at >>20, G at >>10, B at >>0. + let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); + let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); + let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + let b2 = v128_and(p2, mask_3ff); + let b3 = v128_and(p3, mask_3ff); + + // Down-shift 10-bit → 8-bit. + let r0_u8 = u32x4_shr(r0, 2); + let r1_u8 = u32x4_shr(r1, 2); + let r2_u8 = u32x4_shr(r2, 2); + let r3_u8 = u32x4_shr(r3, 2); + let g0_u8 = u32x4_shr(g0, 2); + let g1_u8 = u32x4_shr(g1, 2); + let g2_u8 = u32x4_shr(g2, 2); + let g3_u8 = u32x4_shr(g3, 2); + let b0_u8 = u32x4_shr(b0, 2); + let b1_u8 = u32x4_shr(b1, 2); + let b2_u8 = u32x4_shr(b2, 2); + let b3_u8 = u32x4_shr(b3, 2); + + // u32x4 → u16x8 (saturating narrow). + let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); + let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); + let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); + let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); + let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); + let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); + + // u16x8 → u8x16. + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + // Interleave (R, G, B) into 48 packed bytes via the same + // 9-shuffle pattern used by the YUV→RGB kernels. 
+ let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -728,7 +734,11 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// to `0xFF`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row<const BE: bool>( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -736,36 +746,38 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - - // Extract 10-bit channels into u32 lanes, down-shift to u8. - let r = v128_and(u32x4_shr(pix, 20), mask_3ff); - let g = v128_and(u32x4_shr(pix, 10), mask_3ff); - let b = v128_and(pix, mask_3ff); - let r = u32x4_shr(r, 2); - let g = u32x4_shr(g, 2); - let b = u32x4_shr(b, 2); - - // Pack (R, G, B, 0xFF) bytes per pixel. - // Each channel value is in low byte of its u32 lane. - // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + + // Extract 10-bit channels into u32 lanes, down-shift to u8.
+ let r = v128_and(u32x4_shr(pix, 20), mask_3ff); + let g = v128_and(u32x4_shr(pix, 10), mask_3ff); + let b = v128_and(pix, mask_3ff); + let r = u32x4_shr(r, 2); + let g = u32x4_shr(g, 2); + let b = u32x4_shr(b, 2); + + // Pack (R, G, B, 0xFF) bytes per pixel. + // Each channel value is in low byte of its u32 lane. + // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -777,72 +789,78 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2RGB10→u16 RGB native. 8 pixels per iteration. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row<const BE: bool>( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - // Interleave (R, G, B) u16x8 into 24 u16 elements. - // Element granularity is u16 (2 bytes); shuffle masks below - // index by byte. For u16-per-element interleave, byte mask - // pulls 2 consecutive bytes per element. - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`. - let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]).
- let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + // Interleave (R, G, B) u16x8 into 24 u16 elements. + // Element granularity is u16 (2 bytes); shuffle masks below + // index by byte. For u16-per-element interleave, byte mask + // pulls 2 consecutive bytes per element. + let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. + let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). + let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -855,80 +873,86 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// extracts R from low bits and B from high bits.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); - - // X2BGR10: R at low 10, G at >>10, B at >>20. - let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); - let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); - let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); - let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); - let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); - let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); - let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); - let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); - let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); - let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); - let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); - let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); - - let r_lo = u16x8_narrow_i32x4(r0, r1); - let r_hi = u16x8_narrow_i32x4(r2, r3); - let g_lo = u16x8_narrow_i32x4(g0, g1); - let g_hi = u16x8_narrow_i32x4(g2, g3); - let b_lo = u16x8_narrow_i32x4(b0, b1); - let b_hi = u16x8_narrow_i32x4(b2, b3); - - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); + + // X2BGR10: R at low 10, G at
>>10, B at >>20. + let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); + let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); + let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); + let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); + let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); + let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); + let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); + let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); + let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); + let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); + let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); + let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); + + let r_lo = u16x8_narrow_i32x4(r0, r1); + let r_hi = u16x8_narrow_i32x4(r2, r3); + let g_lo = u16x8_narrow_i32x4(g0, g1); + let g_hi = u16x8_narrow_i32x4(g2, g3); + let b_lo = u16x8_narrow_i32x4(b0, b1); + let b_hi = u16x8_narrow_i32x4(b2, b3); + + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -941,7 +965,11 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// holds 4 RGBA pixels). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -949,30 +977,32 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - - // X2BGR10 channel positions: R at low, G mid, B high.
- let r = u32x4_shr(v128_and(pix, mask_3ff), 2); - let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); - let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); - - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + + // X2BGR10 channel positions: R at low, G mid, B high. + let r = u32x4_shr(v128_and(pix, mask_3ff), 2); + let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); + let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); + + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -984,68 +1014,74 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2BGR10→u16 RGB native. 8 pixels per iteration. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(p0, mask_3ff); - let r1 = v128_and(p1, mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`.
- let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). - let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(p0, mask_3ff); + let r1 = v128_and(p1, mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. + let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). 
+ let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs index 087eb8f..5c4c24f 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs @@ -217,6 +217,42 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { u8x16_narrow_i16x8(shr, zero) } +// ---- endian byte-swap helper ------------------------------------------------- + +/// Compile-time host endianness. `true` on BE targets, `false` on LE. +/// +/// Used by [`byteswap_if_be`] to gate the swap on `BE != HOST_NATIVE_BE`, +/// covering all four `wire × host` quadrants. Mirrors the gate established +/// in the canonical NEON `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in `v` so the returned value is in +/// **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// Uses `u8x16_swizzle` with a compile-time mask. The unused branch folds +/// at compile time since both `BE` and `HOST_NATIVE_BE` are constants. +#[inline(always)] +unsafe fn byteswap_if_be<const BE: bool>(v: v128) -> v128 { + if BE != HOST_NATIVE_BE { + // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] + u8x16_swizzle( + v, + i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + ) + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -234,7 +270,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -242,9 +282,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -257,7 +297,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -271,7 +311,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_row<const BE: bool>( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -280,9 +324,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -294,7 +338,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -308,7 +352,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -316,15 +364,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -338,7 +386,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row<const BE: bool>( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -347,15 +399,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -374,7 +426,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -382,9 +438,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); // ch0=B, ch1=G, ch2=R let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -396,7 +452,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -411,7 +467,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -420,9 +480,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -433,7 +493,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -448,7 +508,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -456,16 +520,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B order write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -480,7 +544,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -489,16 +557,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let v1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B, A order write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -517,7 +585,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -525,10 +597,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -539,7 +611,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -554,7 +626,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgba_row<const BE: bool>( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -562,10 +638,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -577,7 +653,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -592,7 +668,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -600,16 +680,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -624,7 +704,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row<const BE: bool>( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -636,16 +716,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,7 +746,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -674,10 +758,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); // ch0=B, ch1=G, ch2=R, ch3=A let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -689,7 +773,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -704,7 +788,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgba_row<const BE: bool>( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -712,10 +800,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -727,7 +815,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -742,7 +830,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -750,17 +842,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0) let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -775,7 +867,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row<const BE: bool>( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -787,17 +879,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::<BE>(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::<BE>(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::<BE>(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::<BE>(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0, A=ch3) let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb.rs b/src/row/arch/wasm_simd128/tests/packed_rgb.rs index dbd979a..6e99d43 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb.rs @@ -207,9 +207,9 @@ fn simd128_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -224,9 +224,9 @@ fn simd128_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - 
scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_wasm, w); + x2rgb10_to_rgba_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -241,9 +241,9 @@ fn simd128_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -258,9 +258,9 @@ fn simd128_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -275,9 +275,9 @@ fn simd128_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_wasm, w); + x2bgr10_to_rgba_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -292,9 +292,9 @@ fn simd128_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs index e2ab1f7..183e036 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn wasm_rgb48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xDEAD_BEEF_1234_5678); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb diverges (width={w})"); } } @@ -49,8 +49,8 @@ fn wasm_rgb48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xCAFE_BABE_DEAD_1234); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba diverges (width={w})"); } } @@ -62,8 +62,8 @@ fn wasm_rgb48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xFEED_FACE_ABCD_EF01); let mut scalar_out = std::vec![0u16; w * 3]; let 
mut simd_out = std::vec![0u16; w * 3]; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb_u16 diverges (width={w})"); } } @@ -75,8 +75,8 @@ fn wasm_rgb48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x1234_5678_9ABC_DEF0); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba_u16 diverges (width={w})"); } } @@ -92,8 +92,8 @@ fn wasm_bgr48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xABCD_EF01_2345_6789); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgb diverges (width={w})"); } } @@ -105,8 +105,8 @@ fn wasm_bgr48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x9876_5432_10FE_DCBA); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba diverges (width={w})"); } } @@ -118,8 +118,8 @@ fn wasm_bgr48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x0011_2233_4455_6677); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgb_u16 diverges (width={w})"); } } @@ -131,8 +131,8 @@ fn wasm_bgr48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x8899_AABB_CCDD_EEFF); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba_u16 diverges (width={w})"); } } @@ -148,8 +148,8 @@ fn wasm_rgba64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xF0F0_F0F0_0F0F_0F0F); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb diverges (width={w})"); } } @@ -161,8 +161,8 @@ fn 
wasm_rgba64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x1357_9BDF_2468_ACE0); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba diverges (width={w})"); } } @@ -174,8 +174,8 @@ fn wasm_rgba64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x2468_ACE0_1357_9BDF); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb_u16 diverges (width={w})"); } } @@ -187,8 +187,8 @@ fn wasm_rgba64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x3C3C_C3C3_5A5A_A5A5); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba_u16 diverges (width={w})"); } } @@ -204,8 +204,8 @@ fn wasm_bgra64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x7654_3210_FEDC_BA98); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb diverges (width={w})"); } } @@ -217,8 +217,8 @@ fn wasm_bgra64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xAABB_CCDD_EEFF_0011); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgba_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba diverges (width={w})"); } } @@ -230,8 +230,8 @@ fn wasm_bgra64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x5566_7788_99AA_BBCC); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb_u16 diverges (width={w})"); } } @@ -243,8 +243,8 @@ fn wasm_bgra64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xDDEE_FF00_1122_3344); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut 
scalar_out, w); + unsafe { wasm_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba_u16 diverges (width={w})"); } } @@ -299,8 +299,8 @@ fn wasm_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 4]; let mut scalar_out = std::vec![0u16; 9 * 4]; - unsafe { wasm_rgba64_to_rgba_u16_row(&src, &mut simd_out, 9) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 9) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order: SIMD vs scalar mismatch (channel mixing?)" @@ -321,8 +321,8 @@ fn wasm_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 3]; let mut scalar_out = std::vec![0u16; 9 * 3]; - unsafe { wasm_rgba64_to_rgb_u16_row(&src, &mut simd_out, 9) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 9) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -341,8 +341,8 @@ fn wasm_bgra64_to_rgba_u16_lane_order_regression() { let src = make_bgra64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 4]; let mut scalar_out = std::vec![0u16; 9 * 4]; - unsafe { wasm_bgra64_to_rgba_u16_row(&src, &mut simd_out, 9) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 9) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order: SIMD vs scalar mismatch (B↔R swap or alpha?)" @@ -362,8 +362,8 @@ fn wasm_bgra64_to_rgb_u16_lane_order_regression() { let src = make_bgra64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 3]; let mut scalar_out = std::vec![0u16; 9 * 3]; - unsafe { wasm_bgra64_to_rgb_u16_row(&src, &mut simd_out, 9) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 9) }; + scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -374,3 +374,298 @@ fn wasm_bgra64_to_rgb_u16_lane_order_regression() { assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probe the `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 17 = 2 × 8-lane wasm-simd128 SIMD body + 1 scalar tail. 
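Worked out on a single sample, the construction the helper below performs looks like this (a hand-checked sketch, not part of the diff):

    // 0x1234 serialized both ways, then reinterpreted with from_ne_bytes:
    let v: u16 = 0x1234;
    assert_eq!(v.to_le_bytes(), [0x34, 0x12]);
    assert_eq!(v.to_be_bytes(), [0x12, 0x34]);
    // On a little-endian host the BE buffer therefore reads back as 0x3412,
    // and only a kernel instantiated with `BE = true` recovers 0x1234 from it.
    #[cfg(target_endian = "little")]
    assert_eq!(u16::from_ne_bytes(v.to_be_bytes()), 0x3412);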
+ +#[cfg(target_feature = "simd128")] +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) { + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec<u16> = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec<u16> = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_rgb48_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 3, 0xACE1_DEAD_BEEF_0001); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_rgb48_to_rgb_row::<false>(&le, &mut out_le, 17); + wasm_rgb48_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_rgb48_to_rgba_row::<false>(&le, &mut out_le, 17); + wasm_rgb48_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + wasm_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + wasm_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_bgr48_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 3, 0xBEEF_C0DE_FACE_0002); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_bgr48_to_rgb_row::<false>(&le, &mut out_le, 17); + wasm_bgr48_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_bgr48_to_rgba_row::<false>(&le, &mut out_le, 17); + wasm_bgr48_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_bgr48_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + wasm_bgr48_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_bgr48_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + wasm_bgr48_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_rgba64_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 4, 0xCAFE_F00D_BABE_0003); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17
* 3]; + unsafe { + wasm_rgba64_to_rgb_row::<false>(&le, &mut out_le, 17); + wasm_rgba64_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_rgba64_to_rgba_row::<false>(&le, &mut out_le, 17); + wasm_rgba64_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_rgba64_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + wasm_rgba64_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_rgba64_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + wasm_rgba64_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_bgra64_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 4, 0xFEED_BEEF_FACE_0004); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_bgra64_to_rgb_row::<false>(&le, &mut out_le, 17); + wasm_bgra64_to_rgb_row::<true>(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_bgra64_to_rgba_row::<false>(&le, &mut out_le, 17); + wasm_bgra64_to_rgba_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_bgra64_to_rgb_u16_row::<false>(&le, &mut out_le, 17); + wasm_bgra64_to_rgb_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_bgra64_to_rgba_u16_row::<false>(&le, &mut out_le, 17); + wasm_bgra64_to_rgba_u16_row::<true>(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in `tests/packed_rgb.rs`, which is not +// declared in `tests/mod.rs`) so they are actually compiled and run. 
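The same construction applies to the 32-bit X2 words, one value by hand (a sketch; the parity tests that follow feed the LE bytes to the `<false>` instantiation and the BE bytes to `<true>` and require identical output):

    let w: u32 = 0x1234_5678;
    assert_eq!(w.to_le_bytes(), [0x78, 0x56, 0x34, 0x12]);
    assert_eq!(w.to_be_bytes(), [0x12, 0x34, 0x56, 0x78]);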
+ +#[cfg(target_feature = "simd128")] +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec<u32> { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +#[cfg(target_feature = "simd128")] +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec<u8>, std::vec::Vec<u8>) { + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_x2rgb10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2rgb10_to_rgb_row::<false>(&le, &mut out_le, 33); + x2rgb10_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2rgb10_to_rgba_row::<false>(&le, &mut out_le, 33); + x2rgb10_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + x2rgb10_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_x2bgr10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2bgr10_to_rgb_row::<false>(&le, &mut out_le, 33); + x2bgr10_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2bgr10_to_rgba_row::<false>(&le, &mut out_le, 33); + x2bgr10_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + x2bgr10_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/x86_avx2/packed_rgb.rs b/src/row/arch/x86_avx2/packed_rgb.rs index bfae38a..b90174f 100644 --- a/src/row/arch/x86_avx2/packed_rgb.rs +++ b/src/row/arch/x86_avx2/packed_rgb.rs @@ -445,21 +445,27 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`]. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row<const BE: bool>( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -471,21 +477,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2RGB10→RGBA. 32 pixels per iteration. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row<const BE: bool>( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -497,21 +509,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2RGB10→u16 RGB native. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row<const BE: bool>( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::<BE>( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -523,21 +541,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX2 X2BGR10→RGB. 32 pixels per iteration. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -549,21 +573,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2BGR10→RGBA. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -575,21 +605,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2BGR10→u16 RGB native. 16 pixels per iteration. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx2/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/packed_rgb_16bit.rs index 086a689..0bfa970 100644 --- a/src/row/arch/x86_avx2/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/packed_rgb_16bit.rs @@ -297,6 +297,62 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Compile-time host endianness. `true` on BE targets, `false` on LE. +/// +/// Used by the byte-swap helpers below to gate the swap on +/// `BE != HOST_NATIVE_BE`, covering all four `wire × host` quadrants. Mirrors +/// the gate established in `gray.rs` and the canonical NEON +/// `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in a `__m128i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE` — see [`byteswap256_if_be`] for the +/// full truth table. Uses `_mm_shuffle_epi8` (SSSE3 subset of AVX2). 
+#[inline(always)] +unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i { + if BE != HOST_NATIVE_BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Conditionally byte-swap every u16 lane in a `__m256i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// Uses `_mm256_shuffle_epi8` (AVX2). The unused branch folds at compile +/// time since both `BE` and `HOST_NATIVE_BE` are constants. +#[inline(always)] +unsafe fn byteswap256_if_be<const BE: bool>(v: __m256i) -> __m256i { + if BE != HOST_NATIVE_BE { + // Same u16-lane byte-swap mask, broadcast to both 128-bit lanes. + const MASK: __m256i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm256_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -307,6 +363,7 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// target_feature, exploiting that SSE4.1/SSSE3 are AVX2 subsets. Each half /// deinterleaves with shuffle masks, narrows via `>> 8`, writes 8 pixels /// (24 bytes). 16 pixels are produced per outer loop iteration. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -315,7 +372,11 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// 3. `rgb_out.len() >= width * 3`. 
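Reduced to one scalar sample, the gate the two helpers implement looks like this (an illustrative sketch; `u16_to_host` is not part of the diff):

    #[inline]
    fn u16_to_host<const BE: bool>(v: u16) -> u16 {
        const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
        // Swap exactly when wire and host byte order disagree; both operands
        // are compile-time constants, so the branch folds away entirely.
        if BE != HOST_NATIVE_BE { v.swap_bytes() } else { v }
    }

Note the contrast with the AVX2 X2 kernels earlier in this diff, which have no vector swap path at all: when `BE` is true they skip the SIMD body (`if !BE`) and let the scalar fallback convert the entire row.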
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -327,9 +388,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let ptr = rgb48.as_ptr().add(x * 3); // First half: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -340,9 +401,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi // Second half: pixels x+8..x+15 let ptr8 = ptr.add(24); // 24 u16 ahead = 8 pixels × 3 channels - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -355,13 +416,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi } // Handle remaining pixels (< 16) via scalar fallback. if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → packed u8 RGBA. 16 pixels per outer iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -369,7 +432,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_row<const BE: bool>( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -381,9 +448,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -393,9 +460,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -407,13 +474,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGB (identity repack). 16 pixels per iteration. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -421,7 +490,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -430,29 +503,31 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGBA. 16 pixels per iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -460,7 +535,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row<const BE: bool>( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -470,23 +549,23 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -499,6 +578,7 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order; /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -507,7 +587,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
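The zero-cost B↔R swap just described, in miniature (a scalar sketch of the u16 identity-repack case, not part of the diff):

    fn bgr48_px_to_rgb_u16(px: [u16; 3]) -> [u16; 3] {
        let [b, g, r] = px; // deinterleaved in source memory order: ch0=B, ch1=G, ch2=R
        [r, g, b]           // emitted in R, G, B order; no arithmetic, just rebinding
    }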
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -517,9 +601,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -529,9 +613,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgb_out.as_mut_ptr().add(x * 3), 24); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -543,13 +627,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 16; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgr48 → packed u8 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -558,7 +643,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -570,9 +659,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -582,9 +671,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -596,13 +685,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGB. 16 pixels per outer iteration. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -611,7 +701,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -620,29 +714,30 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -651,7 +746,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -661,23 +760,23 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -691,6 +790,7 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// Loads 4 × `__m256i` (64 u16 = 16 pixels), deinterleaves via the /// cascade helper, narrows via `>> 8` + `packus_epi16` + lane fix, writes /// 16 pixels (48 bytes) via `write_rgb_16` on the low 128 bits. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -699,7 +799,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -708,10 +812,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -720,13 +824,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -735,7 +840,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgba_row<const BE: bool>( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -744,10 +853,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -757,13 +866,15 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGB. 16 pixels per SIMD iteration. Alpha discarded.
/// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -771,7 +882,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -779,10 +894,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Write in two 8-pixel halves using the existing 128-bit helper. write_rgb_u16_8( @@ -800,13 +915,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGBA (identity copy). 16 pixels per iteration. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -815,7 +931,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row<const BE: bool>( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -827,10 +943,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( _mm256_castsi256_si128(r_u16), @@ -849,7 +965,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( x += 16; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -862,6 +978,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded.
/// /// `deinterleave_rgba64_16px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -870,7 +987,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -879,10 +1000,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); @@ -892,13 +1013,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -907,7 +1029,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgba_row<const BE: bool>( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -916,10 +1042,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -929,13 +1055,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGB. 16 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -944,7 +1071,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -952,10 +1083,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Swap B↔R: store (R, G, B) write_rgb_u16_8( @@ -973,13 +1104,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -988,7 +1120,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row<const BE: bool>( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1000,10 +1132,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::<BE>(_mm256_loadu_si256(ptr.add(48).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( @@ -1023,7 +1155,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( x += 16; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb.rs b/src/row/arch/x86_avx2/tests/packed_rgb.rs index 981c50e..16ea736 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb.rs @@ -231,9 +231,9 @@ fn avx2_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -251,9 +251,9 @@ fn avx2_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -271,9 +271,9 @@ fn avx2_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -291,9 +291,9 @@ fn avx2_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -311,9 +311,9 @@ fn
avx2_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -331,9 +331,9 @@ fn avx2_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs index 1490d6e..1905f3c 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs @@ -63,8 +63,8 @@ fn avx2_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -80,8 +80,8 @@ fn avx2_rgb48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xF0F0); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 16) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-16: SIMD vs scalar mismatch" @@ -97,8 +97,8 @@ fn avx2_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -115,8 +115,8 @@ fn avx2_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -136,8 +136,8 @@ fn avx2_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::<false>(&src,
&mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -157,8 +157,8 @@ fn avx2_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -175,8 +175,8 @@ fn avx2_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -196,8 +196,8 @@ fn avx2_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -217,8 +217,8 @@ fn avx2_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=17: SIMD vs scalar mismatch" @@ -234,8 +234,8 @@ fn avx2_bgr48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xA1A1); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 16) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-16: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx2_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); // reuse helper (ch0 treated as B) let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx2_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_row(&src, &mut
simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=17: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx2_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx2_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn avx2_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=17: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx2_rgba64_to_rgb_exact16_matches_scalar() { let src = make_rgba64_src(16, 0x0F0F); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 16) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-16: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx2_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx2_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=17: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx2_rgba64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out =
std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx2_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx2_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx2_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx2_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx2_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx2_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; +
scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=17: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx2_bgra64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx2_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=17: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx2_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx2_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx2_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx2_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx2_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; -
unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -722,7 +722,7 @@ fn avx2_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -739,7 +739,7 @@ fn avx2_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -755,7 +755,7 @@ fn avx2_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..17 { @@ -774,10 +774,321 @@ fn avx2_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 33 = 2 × 16-pixel AVX2 SIMD iterations + 1 scalar tail pixel. A single-sample +// sketch of this construction follows.
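// ---------------------------------------------------------------------------
// Illustrative sketch (hypothetical name, not crate API): the host-independent
// LE/BE pair construction that `make_le_be_pair_u16` below implements, reduced
// to a single sample. Serialising the logical value both ways and then
// reinterpreting each byte stream as a host-native u16 means the raw *bytes*
// carry the wire format on any host.
fn le_be_pair(sample: u16) -> (u16, u16) {
    let le = u16::from_ne_bytes(sample.to_le_bytes()); // LE wire bytes, host view
    let be = u16::from_ne_bytes(sample.to_be_bytes()); // BE wire bytes, host view
    (le, be)
}

fn main() {
    let (le, be) = le_be_pair(0x1234);
    // Host-independent invariants: the bytes of `le` are the LE encoding and
    // the bytes of `be` are the BE encoding of 0x1234, whatever the host is.
    assert_eq!(le.to_ne_bytes(), 0x1234u16.to_le_bytes());
    assert_eq!(be.to_ne_bytes(), 0x1234u16.to_be_bytes());
}
// ---------------------------------------------------------------------------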
+ +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) { + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec<u16> = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec<u16> = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_rgb48_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgb48_src(33, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_rgb48_to_rgb_row::<false>(&le, &mut out_le, 33); + avx2_rgb48_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_rgb48_to_rgba_row::<false>(&le, &mut out_le, 33); + avx2_rgb48_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + avx2_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 33); + avx2_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_bgr48_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgb48_src(33, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_bgr48_to_rgb_row::<false>(&le, &mut out_le, 33); + avx2_bgr48_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_bgr48_to_rgba_row::<false>(&le, &mut out_le, 33); + avx2_bgr48_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_bgr48_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + avx2_bgr48_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_bgr48_to_rgba_u16_row::<false>(&le, &mut out_le, 33); + avx2_bgr48_to_rgba_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_rgba64_be_le_simd_parity_width33() { + if
!std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgba64_src(33, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_rgba64_to_rgb_row::<false>(&le, &mut out_le, 33); + avx2_rgba64_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_rgba64_to_rgba_row::<false>(&le, &mut out_le, 33); + avx2_rgba64_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_rgba64_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + avx2_rgba64_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_rgba64_to_rgba_u16_row::<false>(&le, &mut out_le, 33); + avx2_rgba64_to_rgba_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_bgra64_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgba64_src(33, 0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_bgra64_to_rgb_row::<false>(&le, &mut out_le, 33); + avx2_bgra64_to_rgb_row::<true>(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_bgra64_to_rgba_row::<false>(&le, &mut out_le, 33); + avx2_bgra64_to_rgba_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_bgra64_to_rgb_u16_row::<false>(&le, &mut out_le, 33); + avx2_bgra64_to_rgb_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_bgra64_to_rgba_u16_row::<false>(&le, &mut out_le, 33); + avx2_bgra64_to_rgba_u16_row::<true>(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in the dead-code `tests/packed_rgb.rs` which +// is not declared in `tests/mod.rs`) so they are actually compiled and run. +// Width 65 = 2 × 32-pixel AVX2 SIMD iterations + 1 scalar tail pixel. A +// per-pixel sketch of the 10-bit unpack follows.
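// ---------------------------------------------------------------------------
// Illustrative sketch of the per-pixel unpack the x2rgb10 kernels perform,
// assuming FFmpeg's X2RGB10 bit layout (msb→lsb: 2 bits padding, then 10-bit
// R, G, B). `unpack_x2rgb10` is a hypothetical name for illustration only;
// the real kernels do this 16–32 pixels at a time in registers. `BE` only
// changes how the four wire bytes are assembled into the u32 word — the
// 10-bit field extraction is identical either way.
fn unpack_x2rgb10<const BE: bool>(wire: [u8; 4]) -> (u16, u16, u16) {
    let word = if BE { u32::from_be_bytes(wire) } else { u32::from_le_bytes(wire) };
    let r = ((word >> 20) & 0x3FF) as u16;
    let g = ((word >> 10) & 0x3FF) as u16;
    let b = (word & 0x3FF) as u16;
    (r, g, b)
}

fn main() {
    // One pixel, R = 1023, G = 0, B = 512, encoded both ways.
    let word: u32 = (0x3FF << 20) | 512;
    assert_eq!(unpack_x2rgb10::<false>(word.to_le_bytes()), (1023, 0, 512));
    assert_eq!(unpack_x2rgb10::<true>(word.to_be_bytes()), (1023, 0, 512));
}
// ---------------------------------------------------------------------------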
+ +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec<u32> { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec<u8>, std::vec::Vec<u8>) { + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_x2rgb10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + x2rgb10_to_rgb_row::<false>(&le, &mut out_le, 65); + x2rgb10_to_rgb_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2rgb10_to_rgba_row::<false>(&le, &mut out_le, 65); + x2rgb10_to_rgba_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::<false>(&le, &mut out_le, 65); + x2rgb10_to_rgb_u16_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_x2bgr10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + x2bgr10_to_rgb_row::<false>(&le, &mut out_le, 65); + x2bgr10_to_rgb_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2bgr10_to_rgba_row::<false>(&le, &mut out_le, 65); + x2bgr10_to_rgba_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::<false>(&le, &mut out_le, 65); + x2bgr10_to_rgb_u16_row::<true>(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/x86_avx512/packed_rgb.rs b/src/row/arch/x86_avx512/packed_rgb.rs index 164804d..84d6e8c 100644 --- a/src/row/arch/x86_avx512/packed_rgb.rs +++ b/src/row/arch/x86_avx512/packed_rgb.rs @@ -446,23 +446,29 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`].
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -474,23 +480,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2RGB10→RGBA. 64 pixels per iteration. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -502,23 +514,29 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2RGB10→u16 RGB native. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -530,23 +548,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX-512 X2BGR10→RGB. 64 pixels per iteration. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -558,23 +582,29 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2BGR10→RGBA. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -586,23 +616,29 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2BGR10→u16 RGB native. 32 pixels per iteration. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::<BE>( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx512/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/packed_rgb_16bit.rs index 243fff8..b1f1634 100644 --- a/src/row/arch/x86_avx512/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/packed_rgb_16bit.rs @@ -240,6 +240,63 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { unsafe { _mm512_cvtusepi16_epi8(_mm512_srli_epi16::<8>(v)) } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Compile-time host endianness. `true` on BE targets, `false` on LE.
+/// +/// Used by the byte-swap helpers below to gate the swap on +/// `BE != HOST_NATIVE_BE`, covering all four `wire × host` quadrants. Mirrors +/// the gate established in `gray.rs` and the canonical NEON +/// `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in a `__m128i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE` — see [`byteswap512_if_be`] for the +/// full truth table. Uses `_mm_shuffle_epi8` (SSSE3, a subset of AVX-512). +#[inline(always)] +unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i { + if BE != HOST_NATIVE_BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Conditionally byte-swap every u16 lane in a `__m512i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// Uses `_mm512_shuffle_epi8` (AVX-512BW). The unused branch folds at +/// compile time since both `BE` and `HOST_NATIVE_BE` are constants. +#[inline(always)] +unsafe fn byteswap512_if_be<const BE: bool>(v: __m512i) -> __m512i { + if BE != HOST_NATIVE_BE { + // Same u16-lane byte-swap mask, broadcast across all 64 bytes. + const MASK: __m512i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, + 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm512_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -249,6 +306,7 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { /// Processes four 8-pixel halves (3 × 128-bit loads each) under the /// AVX-512 target_feature context (SSE4.1/SSSE3 are subsets). Narrows /// each channel via `>> 8` and writes 8 pixels (24 bytes) per half. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 2. `rgb48.len() >= width * 3`. /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -268,9 +330,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], while x + 32 <= width { let ptr = rgb48.as_ptr().add(x * 3); // Half 0: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); let b0u8 = narrow_u16x8_to_u8x8(b0, zero); @@ -281,9 +343,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 1: pixels x+8..x+15 let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); let b1u8 = narrow_u16x8_to_u8x8(b1, zero); @@ -294,9 +356,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 2: pixels x+16..x+23 let ptr16 = ptr.add(48); - let v6 = _mm_loadu_si128(ptr16.cast()); - let v7 = _mm_loadu_si128(ptr16.add(8).cast()); - let v8 = _mm_loadu_si128(ptr16.add(16).cast()); + let v6 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.cast())); + let v7 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.add(8).cast())); + let v8 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.add(16).cast())); let (r2, g2, b2) = deinterleave_rgb48_8px(v6, v7, v8); let r2u8 = narrow_u16x8_to_u8x8(r2, zero); let g2u8 = narrow_u16x8_to_u8x8(g2, zero); let b2u8 = narrow_u16x8_to_u8x8(b2, zero); @@ -307,9 +369,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 3: pixels x+24..x+31 let ptr24 = ptr.add(72); - let v9 = _mm_loadu_si128(ptr24.cast()); - let v10 = _mm_loadu_si128(ptr24.add(8).cast()); - let v11 = _mm_loadu_si128(ptr24.add(16).cast()); + let v9 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.cast())); + let v10 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.add(8).cast())); + let v11 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.add(16).cast())); let (r3, g3, b3) = deinterleave_rgb48_8px(v9, v10, v11); let r3u8 = narrow_u16x8_to_u8x8(r3, zero); let g3u8 = narrow_u16x8_to_u8x8(g3, zero); let b3u8 = narrow_u16x8_to_u8x8(b3, zero); @@ -322,13 +384,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], } // Scalar tail: remaining < 32 pixels. if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → packed u8 RGBA. 32 pixels per outer iteration. Alpha /// forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgb48.len() >= width * 3`. /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgba_row<const BE: bool>( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -351,9 +418,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] macro_rules! process_half { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); let bu8 = narrow_u16x8_to_u8x8(b, zero); @@ -372,13 +439,15 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGB (identity repack). 32 pixels per iter. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgb48.len() >= width * 3`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -397,9 +470,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u macro_rules! process_half_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -413,13 +486,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGBA. 32 pixels per iter. Alpha forced to /// 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgb48.len() >= width * 3`. /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row<const BE: bool>( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -444,9 +518,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( macro_rules! process_half_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -460,7 +534,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -471,6 +545,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( /// AVX-512 Bgr48 → packed u8 RGB. 32 pixels per outer iteration. /// B↔R swap via passing `(ch2, ch1, ch0)` to write helpers. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgr48.len() >= width * 3`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -491,9 +570,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], macro_rules! process_half_bgr { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); let bu8 = narrow_u16x8_to_u8x8(b, zero); @@ -512,13 +591,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], x += 32; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → packed u8 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgr48.len() >= width * 3`. /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgba_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -541,9 +625,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] macro_rules! process_half_bgr_rgba { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); let bu8 = narrow_u16x8_to_u8x8(b, zero); @@ -562,13 +646,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGB. 32 pixels per iter. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgr48.len() >= width * 3`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -588,9 +677,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u macro_rules! process_half_bgr_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -604,13 +693,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgr48.len() >= width * 3`. /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row<const BE: bool>( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -635,9 +725,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( macro_rules! process_half_bgr_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::<BE>(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -651,7 +741,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,6 +756,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( /// 32 pixels (96 bytes) via `write_rgb_16` on 128-bit quarters. /// /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgba64.len() >= width * 4`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgb_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -682,10 +777,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); let b_u8 = narrow_u16x32_to_u8x32(b_u16); @@ -707,13 +802,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgba64.len() >= width * 4`. /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgba_row<const BE: bool>( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -730,10 +830,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); let b_u8 = narrow_u16x32_to_u8x32(b_u16); @@ -757,13 +857,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgba64.len() >= width * 4`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row<const BE: bool>( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -784,23 +885,24 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); // Use the shared write_rgb_u16_32 helper (writes 32 px = 4 × 8-px chunks). write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGBA (identity copy). 32 pixels per iter. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `rgba64.len() >= width * 4`. /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row<const BE: bool>( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -821,10 +923,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let opaque = _mm_set1_epi16(-1i16); // 0xFFFF placeholder — not used; a_u16 has real alpha let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -862,7 +964,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -875,6 +977,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_32px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgra64.len() >= width * 4`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgb_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -891,10 +998,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); @@ -916,13 +1023,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgra64.len() >= width * 4`. /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgba_row<const BE: bool>( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -939,10 +1051,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); let b_u8 = narrow_u16x32_to_u8x32(b_u16); @@ -966,13 +1078,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgra64.len() >= width * 4`. /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row<const BE: bool>( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -993,23 +1106,24 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. /// 2. `bgra64.len() >= width * 4`. /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row<const BE: bool>( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1030,10 +1144,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::<BE>(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -1068,7 +1182,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb.rs b/src/row/arch/x86_avx512/tests/packed_rgb.rs index 1cb18db..4fb00aa 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn avx512_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -263,9 +263,9 @@ fn avx512_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -283,9 +283,9 @@ fn avx512_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -303,9 +303,9 @@ fn avx512_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -323,9 +323,9 @@ fn avx512_x2bgr10_to_rgba_matches_scalar() { let
input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -343,9 +343,9 @@ fn avx512_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::<false>(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs index fe4c253..ab4afac 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs @@ -65,8 +65,8 @@ fn avx512_rgb48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0101); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=33: SIMD vs scalar mismatch" @@ -82,8 +82,8 @@ fn avx512_rgb48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xF0F0); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 32) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-32: SIMD vs scalar mismatch" @@ -99,8 +99,8 @@ fn avx512_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -117,8 +117,8 @@ fn avx512_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -138,8 +138,8 @@ fn avx512_rgb48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0303); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_row::<false>(&src, &mut
scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=33: SIMD vs scalar mismatch" @@ -159,8 +159,8 @@ fn avx512_rgb48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0505); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -177,8 +177,8 @@ fn avx512_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -198,8 +198,8 @@ fn avx512_rgb48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0707); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -219,8 +219,8 @@ fn avx512_bgr48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x1111); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=33: SIMD vs scalar mismatch" @@ -236,8 +236,8 @@ fn avx512_bgr48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xA1A1); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 32) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-32: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx512_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx512_bgr48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x2222); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_row(&src, &mut
simd_out, 33) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=33: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx512_bgr48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x3333); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx512_bgr48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x4444); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn avx512_rgba64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0xAAAA); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=33: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx512_rgba64_to_rgb_exact32_matches_scalar() { let src = make_rgba64_src(32, 0x0F0F); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 32) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-32: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx512_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx512_rgba64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0xBBBB); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=33: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx512_rgba64_to_rgba_lane_order_regression() { let src =
make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx512_rgba64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xCCCC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx512_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx512_rgba64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDDDD); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx512_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx512_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx512_bgra64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0x1234); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe {
avx512_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=33: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx512_bgra64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx512_bgra64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0x5678); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=33: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx512_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx512_bgra64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0x9ABC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx512_bgra64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDEF0); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx512_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx512_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33);
let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -720,7 +720,7 @@ fn avx512_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -737,7 +737,7 @@ fn avx512_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -753,7 +753,7 @@ fn avx512_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 33) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..33 { @@ -772,10 +772,320 @@ fn avx512_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes the `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 65 = 2 × 32-pixel AVX-512 SIMD iterations + 1 scalar-tail pixel; a worked +// example of the pair construction follows below.
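A quick sanity sketch of the pair construction (editorial illustration only, not part of the patch; the test name `le_be_pair_construction_handcheck` is hypothetical). For the sample value 0x1234u16, `to_le_bytes()` yields [0x34, 0x12] and `to_be_bytes()` yields [0x12, 0x34]; reinterpreted via `from_ne_bytes`, the two buffers differ as host integers, and only the endian gate in the kernels can reconcile them:

#[test]
fn le_be_pair_construction_handcheck() {
    let (le, be) = make_le_be_pair_u16(&[0x1234u16]);
    // Host-independent checks: from_le/from_be undo the respective encodings.
    assert_eq!(u16::from_le(le[0]), 0x1234, "le buffer is LE-encoded");
    assert_eq!(u16::from_be(be[0]), 0x1234, "be buffer is BE-encoded");
}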
+
+fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_rgb48_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = make_rgb48_src(65, 0xACE1);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        avx512_rgb48_to_rgb_row::<false>(&le, &mut out_le, 65);
+        avx512_rgb48_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        avx512_rgb48_to_rgba_row::<false>(&le, &mut out_le, 65);
+        avx512_rgb48_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        avx512_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 4];
+    let mut out_be = std::vec![0u16; 65 * 4];
+    unsafe {
+        avx512_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_bgr48_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = make_rgb48_src(65, 0xBEEF);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        avx512_bgr48_to_rgb_row::<false>(&le, &mut out_le, 65);
+        avx512_bgr48_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        avx512_bgr48_to_rgba_row::<false>(&le, &mut out_le, 65);
+        avx512_bgr48_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        avx512_bgr48_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_bgr48_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 4];
+    let mut out_be = std::vec![0u16; 65 * 4];
+    unsafe {
+        avx512_bgr48_to_rgba_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_bgr48_to_rgba_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_rgba64_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = make_rgba64_src(65, 0xCAFE);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        avx512_rgba64_to_rgb_row::<false>(&le, &mut out_le, 65);
+        avx512_rgba64_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        avx512_rgba64_to_rgba_row::<false>(&le, &mut out_le, 65);
+        avx512_rgba64_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgba SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        avx512_rgba64_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_rgba64_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 4];
+    let mut out_be = std::vec![0u16; 65 * 4];
+    unsafe {
+        avx512_rgba64_to_rgba_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_rgba64_to_rgba_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_bgra64_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = make_rgba64_src(65, 0xF00D);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        avx512_bgra64_to_rgb_row::<false>(&le, &mut out_le, 65);
+        avx512_bgra64_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        avx512_bgra64_to_rgba_row::<false>(&le, &mut out_le, 65);
+        avx512_bgra64_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgba SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        avx512_bgra64_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_bgra64_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 65 * 4];
+    let mut out_be = std::vec![0u16; 65 * 4];
+    unsafe {
+        avx512_bgra64_to_rgba_u16_row::<false>(&le, &mut out_le, 65);
+        avx512_bgra64_to_rgba_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+// =============================================================================
+// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests
+// =============================================================================
+//
+// Co-located here (rather than in the dead-code `tests/packed_rgb.rs` which
+// is not declared in `tests/mod.rs`) so they are actually compiled and run.
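+//
+// Layout reminder (per FFmpeg's pixfmt documentation): each pixel is one
+// 32-bit little-endian word packing, MSB to LSB, 2 unused bits and three
+// 10-bit channels, e.g. (msb)2X|10R|10G|10B(lsb) for X2RGB10. A byte-swapped
+// word therefore scrambles all three channels, which is exactly the failure
+// mode these tests are designed to catch.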
+
+fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec<u32> {
+    let mut state = seed;
+    (0..width)
+        .map(|_| {
+            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            state
+        })
+        .collect()
+}
+
+fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec<u8>, std::vec::Vec<u8>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    (le_bytes, be_bytes)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_x2rgb10_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = pseudo_random_x2_intended(65, 0xC0DE_BEEF);
+    let (le, be) = make_le_be_pair_x2(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        x2rgb10_to_rgb_row::<false>(&le, &mut out_le, 65);
+        x2rgb10_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        x2rgb10_to_rgba_row::<false>(&le, &mut out_le, 65);
+        x2rgb10_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        x2rgb10_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        x2rgb10_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn avx512_x2bgr10_be_le_simd_parity_width65() {
+    if !std::arch::is_x86_feature_detected!("avx512bw") {
+        return;
+    }
+    let intended = pseudo_random_x2_intended(65, 0xFEED_FACE);
+    let (le, be) = make_le_be_pair_x2(&intended);
+
+    let mut out_le = std::vec![0u8; 65 * 3];
+    let mut out_be = std::vec![0u8; 65 * 3];
+    unsafe {
+        x2bgr10_to_rgb_row::<false>(&le, &mut out_le, 65);
+        x2bgr10_to_rgb_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u8; 65 * 4];
+    let mut out_be = std::vec![0u8; 65 * 4];
+    unsafe {
+        x2bgr10_to_rgba_row::<false>(&le, &mut out_le, 65);
+        x2bgr10_to_rgba_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u16; 65 * 3];
+    let mut out_be = std::vec![0u16; 65 * 3];
+    unsafe {
+        x2bgr10_to_rgb_u16_row::<false>(&le, &mut out_le, 65);
+        x2bgr10_to_rgb_u16_row::<true>(&be, &mut out_be, 65);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity");
+}
diff --git a/src/row/arch/x86_sse41/packed_rgb.rs b/src/row/arch/x86_sse41/packed_rgb.rs
index e5bb35e..12dccd4 100644
--- a/src/row/arch/x86_sse41/packed_rgb.rs
+++ b/src/row/arch/x86_sse41/packed_rgb.rs
@@ -426,18 +426,24 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u
 /// 3. `x2rgb10` / `rgb_out` must not alias.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgb_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3));
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3));
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgb_row(
+            scalar::x2rgb10_to_rgb_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -449,21 +455,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt
 /// SSE4.1 X2RGB10→RGBA. 16 pixels per iteration.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgba_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            x2rgb10_to_rgba_16_pixels(
-                x2rgb10.as_ptr().add(x * 4),
-                rgba_out.as_mut_ptr().add(x * 4),
-            );
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                x2rgb10_to_rgba_16_pixels(
+                    x2rgb10.as_ptr().add(x * 4),
+                    rgba_out.as_mut_ptr().add(x * 4),
+                );
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgba_row(
+            scalar::x2rgb10_to_rgba_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgba_out[x * 4..width * 4],
                 width - x,
@@ -476,21 +488,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi
 /// `u16`, max value `1023`). 8 pixels per iteration.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn x2rgb10_to_rgb_u16_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     unsafe {
        let mut x = 0usize;
-        while x + 8 <= width {
-            x2rgb10_to_rgb_u16_8_pixels(
-                x2rgb10.as_ptr().add(x * 4),
-                rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(),
-            );
-            x += 8;
+        if !BE {
+            while x + 8 <= width {
+                x2rgb10_to_rgb_u16_8_pixels(
+                    x2rgb10.as_ptr().add(x * 4),
+                    rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(),
+                );
+                x += 8;
+            }
         }
         if x < width {
-            scalar::x2rgb10_to_rgb_u16_row(
+            scalar::x2rgb10_to_rgb_u16_row::<BE>(
                 &x2rgb10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -502,18 +520,24 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16],
 /// SSE4.1 X2BGR10→RGB. 16 pixels per iteration.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgb_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3));
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3));
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgb_row(
+            scalar::x2bgr10_to_rgb_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
@@ -525,21 +549,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt
 /// SSE4.1 X2BGR10→RGBA. 16 pixels per iteration.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgba_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     unsafe {
         let mut x = 0usize;
-        while x + 16 <= width {
-            x2bgr10_to_rgba_16_pixels(
-                x2bgr10.as_ptr().add(x * 4),
-                rgba_out.as_mut_ptr().add(x * 4),
-            );
-            x += 16;
+        if !BE {
+            while x + 16 <= width {
+                x2bgr10_to_rgba_16_pixels(
+                    x2bgr10.as_ptr().add(x * 4),
+                    rgba_out.as_mut_ptr().add(x * 4),
+                );
+                x += 16;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgba_row(
+            scalar::x2bgr10_to_rgba_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgba_out[x * 4..width * 4],
                 width - x,
@@ -551,21 +581,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi
 /// SSE4.1 X2BGR10→u16 RGB native. 8 pixels per iteration.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn x2bgr10_to_rgb_u16_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     unsafe {
         let mut x = 0usize;
-        while x + 8 <= width {
-            x2bgr10_to_rgb_u16_8_pixels(
-                x2bgr10.as_ptr().add(x * 4),
-                rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(),
-            );
-            x += 8;
+        if !BE {
+            while x + 8 <= width {
+                x2bgr10_to_rgb_u16_8_pixels(
+                    x2bgr10.as_ptr().add(x * 4),
+                    rgb_out.as_mut_ptr().add(x * 3).cast::<u8>(),
+                );
+                x += 8;
+            }
         }
         if x < width {
-            scalar::x2bgr10_to_rgb_u16_row(
+            scalar::x2bgr10_to_rgb_u16_row::<BE>(
                 &x2bgr10[x * 4..width * 4],
                 &mut rgb_out[x * 3..width * 3],
                 width - x,
diff --git a/src/row/arch/x86_sse41/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/packed_rgb_16bit.rs
index b9dc50f..486279c 100644
--- a/src/row/arch/x86_sse41/packed_rgb_16bit.rs
+++ b/src/row/arch/x86_sse41/packed_rgb_16bit.rs
@@ -306,6 +306,44 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
     unsafe { _mm_packus_epi16(_mm_srli_epi16::<8>(v), zero) }
 }
 
+// ---- endian byte-swap helper ------------------------------------------------
+
+/// Compile-time host endianness. `true` on BE targets, `false` on LE.
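+///
+/// For example, on `x86_64` or little-endian `aarch64` targets this constant
+/// evaluates to `false` via `cfg!(target_endian = "big")`, so a kernel
+/// instantiated with `BE = false` compiles down to a no-op gate. A scalar
+/// equivalent of the whole gate (illustrative only, not part of this module)
+/// is `if BE != HOST_NATIVE_BE { v.swap_bytes() } else { v }` per `u16` lane.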
+///
+/// Used by [`byteswap_if_be`] to gate the byte-swap on `BE != HOST_NATIVE_BE`
+/// so the swap fires only when the wire endian differs from the host's
+/// native byte order — covering all four `wire × host` quadrants. Mirrors
+/// the gate established in `gray.rs` and the canonical NEON
+/// `bswap_u16x8_if_be` helper.
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
+/// Conditionally byte-swap every u16 lane in `v` so the returned value is in
+/// **host-native** byte order regardless of the host endianness.
+///
+/// The gate is `BE != HOST_NATIVE_BE`:
+///
+/// | wire `BE` | host | gate    | action            |
+/// |-----------|------|---------|-------------------|
+/// | `false`   | LE   | `false` | no swap (LE→LE)   |
+/// | `false`   | BE   | `true`  | swap (LE→BE)      |
+/// | `true`    | LE   | `true`  | swap (BE→LE)      |
+/// | `true`    | BE   | `false` | no swap (BE→BE)   |
+///
+/// Uses `_mm_shuffle_epi8` (SSSE3, implied by SSE4.1) with the same mask as
+/// `endian::BYTESWAP_MASK_U16`. The unused branch folds at compile time
+/// since `BE` and `HOST_NATIVE_BE` are both compile-time constants.
+#[inline(always)]
+unsafe fn byteswap_if_be<const BE: bool>(v: __m128i) -> __m128i {
+    if BE != HOST_NATIVE_BE {
+        // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]
+        const MASK: __m128i =
+            unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) };
+        unsafe { _mm_shuffle_epi8(v, MASK) }
+    } else {
+        v
+    }
+}
+
 // =============================================================================
 // Rgb48 (R, G, B — 3 u16 elements per pixel)
 // =============================================================================
@@ -314,6 +352,7 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
 ///
 /// Loads 3 × 128-bit chunks (24 u16), deinterleaves with shuffle masks,
 /// narrows via `>> 8`, writes 8 pixels (24 bytes) of interleaved RGB.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -322,7 +361,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i {
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_rgb48_to_rgb_row<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -331,9 +374,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgb48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -345,13 +388,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
+            scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// SSE4.1 Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF.
 ///
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
+///
 /// # Safety
 ///
@@ -359,7 +404,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_rgb48_to_rgba_row<const BE: bool>(
+    rgb48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -370,9 +419,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgb48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -383,14 +432,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::rgb48_to_rgba_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
-/// SSE4.1 Rgb48 → native-depth u16 RGB (identity repack). 8 pixels per iteration.
+/// SSE4.1 Rgb48 → native-depth u16 RGB. 8 pixels per iteration.
 ///
 /// Deinterleaves with shuffle masks, writes 8 pixels via `write_rgb_u16_8`.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -399,7 +449,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8],
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -407,21 +461,23 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgb48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3));
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
+            scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// SSE4.1 Rgb48 → native-depth u16 RGBA. 8 pixels per iteration. Alpha forced to 0xFFFF.
 ///
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
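+/// The swap is performed in-register on the loaded copies, so the `&[u16]`
+/// source row itself is never mutated.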
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available.
@@ -429,7 +485,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row(
+pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row<const BE: bool>(
     rgb48: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -442,9 +498,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row(
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgb48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             write_rgba_u16_8(
                 r_u16,
@@ -456,7 +512,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row(
             x += 8;
         }
         if x < width {
-            scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
@@ -469,6 +525,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row(
 ///
 /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order;
 /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -477,7 +534,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row(
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_bgr48_to_rgb_row<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -486,9 +547,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgr48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             // ch0=B, ch1=G, ch2=R (source BGR order)
             let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
@@ -500,13 +561,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w
             x += 8;
         }
         if x < width {
-            scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
+            scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// SSE4.1 Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; alpha forced to 0xFF.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -515,7 +577,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_bgr48_to_rgba_row<const BE: bool>(
+    bgr48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -526,9 +592,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgr48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -539,13 +605,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
 /// SSE4.1 Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration.
 /// B↔R swap; values unchanged.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -554,7 +621,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8],
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -562,22 +633,23 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgr48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             // Store as R, G, B (swap applied by argument order)
             write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3));
             x += 8;
         }
         if x < width {
-            scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
+            scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// SSE4.1 Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; alpha forced to 0xFFFF.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -586,7 +658,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row(
+pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row<const BE: bool>(
     bgr48: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -599,9 +671,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row(
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgr48.as_ptr().add(x * 3);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
             let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2);
             write_rgba_u16_8(
                 r_u16,
@@ -613,7 +685,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row(
             x += 8;
         }
         if x < width {
-            scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
+            scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
@@ -624,6 +696,8 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row(
 
 /// SSE4.1 Rgba64 → packed u8 RGB. 8 pixels per SIMD iteration. Alpha discarded.
 ///
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available.
@@ -631,7 +705,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row(
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_rgba64_to_rgb_row<const BE: bool>(
+    rgba64: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -640,10 +718,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgba64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -654,7 +732,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
+            scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
@@ -662,6 +740,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
 /// SSE4.1 Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through.
 ///
 /// All 4 channels narrowed via `>> 8`.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -670,7 +749,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8],
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_rgba64_to_rgba_row<const BE: bool>(
+    rgba64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -679,10 +762,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgba64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -694,13 +777,15 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8
             x += 8;
         }
         if x < width {
-            scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
+            scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
 /// SSE4.1 Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded.
 ///
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
+///
 /// # Safety
 ///
 /// 1. SSE4.1 must be available.
@@ -708,7 +793,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row(
+pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row<const BE: bool>(
     rgba64: &[u16],
     rgb_out: &mut [u16],
     width: usize,
@@ -720,16 +805,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row(
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgba64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3));
             x += 8;
         }
         if x < width {
-            scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
+            scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
@@ -737,6 +822,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row(
 /// SSE4.1 Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per iteration.
 ///
 /// All 4 channels passed through at native depth; source alpha preserved.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -745,7 +831,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row(
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row(
+pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row<const BE: bool>(
     rgba64: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -757,16 +843,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row(
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = rgba64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4));
             x += 8;
         }
         if x < width {
-            scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
+            scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
@@ -779,6 +865,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row(
 /// B↔R swap; alpha discarded.
 ///
 /// `deinterleave_rgba64_8px` yields `(B, G, R, A)` in source memory order.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -787,7 +874,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row(
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_bgra64_to_rgb_row<const BE: bool>(
+    bgra64: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
@@ -796,10 +887,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8],
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgra64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order)
             let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
@@ -811,13 +902,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8],
             x += 8;
         }
         if x < width {
-            scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
+            scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
         }
     }
 }
 
 /// SSE4.1 Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; source alpha passes through (narrowed via `>> 8`).
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -826,7 +918,11 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8],
 /// 3. `rgba_out.len() >= width * 4`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn sse41_bgra64_to_rgba_row<const BE: bool>(
+    bgra64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
@@ -835,10 +931,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgra64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero);
             let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero);
@@ -850,13 +946,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8
             x += 8;
         }
         if x < width {
-            scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
+            scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x);
         }
     }
 }
 
 /// SSE4.1 Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration.
 /// B↔R swap; alpha discarded.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -865,7 +962,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8
 /// 3. `rgb_out.len() >= width * 3`.
 #[inline]
 #[target_feature(enable = "sse4.1")]
-pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row(
+pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row<const BE: bool>(
     bgra64: &[u16],
     rgb_out: &mut [u16],
     width: usize,
@@ -877,23 +974,24 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row(
         let mut x = 0usize;
         while x + 8 <= width {
             let ptr = bgra64.as_ptr().add(x * 4);
-            let v0 = _mm_loadu_si128(ptr.cast());
-            let v1 = _mm_loadu_si128(ptr.add(8).cast());
-            let v2 = _mm_loadu_si128(ptr.add(16).cast());
-            let v3 = _mm_loadu_si128(ptr.add(24).cast());
+            let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast()));
+            let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast()));
+            let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast()));
+            let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast()));
             let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3);
             // Swap B↔R: store (R, G, B)
             write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3));
             x += 8;
         }
         if x < width {
-            scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
+            scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x);
        }
     }
 }
 
 /// SSE4.1 Bgra64 → native-depth u16 RGBA. 8 pixels per SIMD iteration.
 /// B↔R swap; source alpha preserved at position 3.
+/// When `BE = true` each loaded register is byte-swapped before deinterleaving.
 ///
 /// # Safety
 ///
@@ -902,7 +1000,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row(
 /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -914,17 +1012,17 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::(_mm_loadu_si128(ptr.add(24).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_sse41/tests/packed_rgb.rs b/src/row/arch/x86_sse41/tests/packed_rgb.rs index 10f8192..e64aeba 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn sse41_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -263,9 +263,9 @@ fn sse41_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_sse, w); + x2rgb10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -283,9 +283,9 @@ fn sse41_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -303,9 +303,9 @@ fn sse41_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -323,9 +323,9 @@ fn sse41_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - 
x2bgr10_to_rgba_row(&input, &mut out_sse, w); + x2bgr10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -343,9 +343,9 @@ fn sse41_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs index 319ee5f..34553ad 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn sse41_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -53,8 +53,8 @@ fn sse41_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -70,8 +70,8 @@ fn sse41_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -91,8 +91,8 @@ fn sse41_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -112,8 +112,8 @@ fn sse41_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -133,8 +133,8 @@ fn sse41_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut 
scalar_out = std::vec![0u16; 17 * 4];
-    unsafe { sse41_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) };
-    scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_rgb48_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::rgb48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch"
@@ -154,8 +154,8 @@ fn sse41_bgr48_to_rgb_matches_scalar_width17() {
     let src = make_rgb48_src(17, 0x1111);
     let mut simd_out = std::vec![0u8; 17 * 3];
     let mut scalar_out = std::vec![0u8; 17 * 3];
-    unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 17) };
-    scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgr48→rgb width=17: SIMD vs scalar mismatch"
@@ -171,8 +171,8 @@ fn sse41_bgr48_to_rgb_exact8_matches_scalar() {
     let src = make_rgb48_src(8, 0xA1A1);
     let mut simd_out = std::vec![0u8; 8 * 3];
     let mut scalar_out = std::vec![0u8; 8 * 3];
-    unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 8) };
-    scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 8);
+    unsafe { sse41_bgr48_to_rgb_row::<false>(&src, &mut simd_out, 8) };
+    scalar::bgr48_to_rgb_row::<false>(&src, &mut scalar_out, 8);
     assert_eq!(
         simd_out, scalar_out,
         "bgr48→rgb exact-8: SIMD vs scalar mismatch"
@@ -192,8 +192,8 @@ fn sse41_bgr48_to_rgba_matches_scalar_width17() {
     let src = make_rgb48_src(17, 0x2222);
     let mut simd_out = std::vec![0u8; 17 * 4];
     let mut scalar_out = std::vec![0u8; 17 * 4];
-    unsafe { sse41_bgr48_to_rgba_row(&src, &mut simd_out, 17) };
-    scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgr48_to_rgba_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgr48_to_rgba_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgr48→rgba width=17: SIMD vs scalar mismatch"
@@ -213,8 +213,8 @@ fn sse41_bgr48_to_rgb_u16_matches_scalar_width17() {
     let src = make_rgb48_src(17, 0x3333);
     let mut simd_out = std::vec![0u16; 17 * 3];
     let mut scalar_out = std::vec![0u16; 17 * 3];
-    unsafe { sse41_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) };
-    scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgr48_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgr48_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch"
@@ -234,8 +234,8 @@ fn sse41_bgr48_to_rgba_u16_matches_scalar_width17() {
     let src = make_rgb48_src(17, 0x4444);
     let mut simd_out = std::vec![0u16; 17 * 4];
     let mut scalar_out = std::vec![0u16; 17 * 4];
-    unsafe { sse41_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) };
-    scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgr48_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgr48_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch"
@@ -255,8 +255,8 @@ fn sse41_rgba64_to_rgb_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0xAAAA);
     let mut simd_out = std::vec![0u8; 17 * 3];
     let mut scalar_out = std::vec![0u8; 17 * 3];
-    unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 17) };
-    scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 17) };
+    scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgb width=17: SIMD vs scalar mismatch"
@@ -272,8 +272,8 @@ fn sse41_rgba64_to_rgb_exact8_matches_scalar() {
     let src = make_rgba64_src(8, 0x0F0F);
     let mut simd_out = std::vec![0u8; 8 * 3];
     let mut scalar_out = std::vec![0u8; 8 * 3];
-    unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 8) };
-    scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 8);
+    unsafe { sse41_rgba64_to_rgb_row::<false>(&src, &mut simd_out, 8) };
+    scalar::rgba64_to_rgb_row::<false>(&src, &mut scalar_out, 8);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgb exact-8: SIMD vs scalar mismatch"
@@ -293,8 +293,8 @@ fn sse41_rgba64_to_rgba_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0xBBBB);
     let mut simd_out = std::vec![0u8; 17 * 4];
     let mut scalar_out = std::vec![0u8; 17 * 4];
-    unsafe { sse41_rgba64_to_rgba_row(&src, &mut simd_out, 17) };
-    scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_rgba64_to_rgba_row::<false>(&src, &mut simd_out, 17) };
+    scalar::rgba64_to_rgba_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgba width=17: SIMD vs scalar mismatch"
@@ -314,8 +314,8 @@ fn sse41_rgba64_to_rgb_u16_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0xCCCC);
     let mut simd_out = std::vec![0u16; 17 * 3];
     let mut scalar_out = std::vec![0u16; 17 * 3];
-    unsafe { sse41_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) };
-    scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch"
@@ -335,8 +335,8 @@ fn sse41_rgba64_to_rgba_u16_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0xDDDD);
     let mut simd_out = std::vec![0u16; 17 * 4];
     let mut scalar_out = std::vec![0u16; 17 * 4];
-    unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) };
-    scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch"
@@ -352,8 +352,8 @@ fn sse41_rgba64_to_rgba_u16_width1_tail_only() {
     let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A
     let mut simd_out = [0u16; 4];
     let mut scalar_out = [0u16; 4];
-    unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) };
-    scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1);
+    unsafe { sse41_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) };
+    scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgba_u16 width=1: tail-only mismatch"
@@ -373,8 +373,8 @@ fn sse41_bgra64_to_rgb_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0x1234);
     let mut simd_out = std::vec![0u8; 17 * 3];
     let mut scalar_out = std::vec![0u8; 17 * 3];
-    unsafe { sse41_bgra64_to_rgb_row(&src, &mut simd_out, 17) };
-    scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgra64_to_rgb_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgra64_to_rgb_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgra64→rgb width=17: SIMD vs scalar mismatch"
@@ -394,8 +394,8 @@ fn sse41_bgra64_to_rgba_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0x5678);
     let mut simd_out = std::vec![0u8; 17 * 4];
     let mut scalar_out = std::vec![0u8; 17 * 4];
-    unsafe { sse41_bgra64_to_rgba_row(&src, &mut simd_out, 17) };
-    scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgra64_to_rgba_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgra64_to_rgba_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgra64→rgba width=17: SIMD vs scalar mismatch"
@@ -415,8 +415,8 @@ fn sse41_bgra64_to_rgb_u16_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0x9ABC);
     let mut simd_out = std::vec![0u16; 17 * 3];
     let mut scalar_out = std::vec![0u16; 17 * 3];
-    unsafe { sse41_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) };
-    scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgra64_to_rgb_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgra64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch"
@@ -436,8 +436,8 @@ fn sse41_bgra64_to_rgba_u16_matches_scalar_width17() {
     let src = make_rgba64_src(17, 0xDEF0);
     let mut simd_out = std::vec![0u16; 17 * 4];
     let mut scalar_out = std::vec![0u16; 17 * 4];
-    unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) };
-    scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17);
+    unsafe { sse41_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 17) };
+    scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 17);
     assert_eq!(
         simd_out, scalar_out,
         "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch"
@@ -453,10 +453,322 @@ fn sse41_bgra64_to_rgba_u16_width1_tail_only() {
     let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A
     let mut simd_out = [0u16; 4];
     let mut scalar_out = [0u16; 4];
-    unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) };
-    scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1);
+    unsafe { sse41_bgra64_to_rgba_u16_row::<false>(&src, &mut simd_out, 1) };
+    scalar::bgra64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 1);
     assert_eq!(
         simd_out, scalar_out,
         "bgra64→rgba_u16 width=1: tail-only mismatch"
     );
 }
+
+// =============================================================================
+// SIMD-level BE-vs-LE parity tests (probes the `BE != HOST_NATIVE_BE` gate)
+// =============================================================================
+//
+// Buffers are built host-independently via `to_le_bytes` / `to_be_bytes`.
+// Width 17 = 2 × 8-pixel SSE4.1 SIMD iterations + 1 scalar-tail pixel.
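+//
+// A minimal scalar model of the gate these tests probe (illustrative sketch
+// only, not part of this patch; it assumes the row kernels take a
+// `const BE: bool` parameter as above and receive the stored bytes
+// reinterpreted as host `u16`):
+//
+//     #[inline]
+//     fn load_sample<const BE: bool>(raw: u16) -> u16 {
+//         // `from_be` / `from_le` byte-swap only when the stored order
+//         // differs from the host order, so on an LE host `BE = false`
+//         // is a no-op and `BE = true` swaps.
+//         if BE { u16::from_be(raw) } else { u16::from_le(raw) }
+//     }
+//
+// Decoding `to_le_bytes` data with `BE = false` and `to_be_bytes` data with
+// `BE = true` recovers identical samples on any host, which is why `out_le`
+// and `out_be` below must match byte-for-byte.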
+
+fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_rgb48_be_le_simd_parity_width17() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = make_rgb48_src(17, 0xACE1);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 17 * 3];
+    let mut out_be = std::vec![0u8; 17 * 3];
+    unsafe {
+        sse41_rgb48_to_rgb_row::<false>(&le, &mut out_le, 17);
+        sse41_rgb48_to_rgb_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 17 * 4];
+    let mut out_be = std::vec![0u8; 17 * 4];
+    unsafe {
+        sse41_rgb48_to_rgba_row::<false>(&le, &mut out_le, 17);
+        sse41_rgb48_to_rgba_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u16; 17 * 3];
+    let mut out_be = std::vec![0u16; 17 * 3];
+    unsafe {
+        sse41_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 4];
+    let mut out_be = std::vec![0u16; 17 * 4];
+    unsafe {
+        sse41_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_bgr48_be_le_simd_parity_width17() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = make_rgb48_src(17, 0xBEEF);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 17 * 3];
+    let mut out_be = std::vec![0u8; 17 * 3];
+    unsafe {
+        sse41_bgr48_to_rgb_row::<false>(&le, &mut out_le, 17);
+        sse41_bgr48_to_rgb_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 17 * 4];
+    let mut out_be = std::vec![0u8; 17 * 4];
+    unsafe {
+        sse41_bgr48_to_rgba_row::<false>(&le, &mut out_le, 17);
+        sse41_bgr48_to_rgba_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u16; 17 * 3];
+    let mut out_be = std::vec![0u16; 17 * 3];
+    unsafe {
+        sse41_bgr48_to_rgb_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_bgr48_to_rgb_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 4];
+    let mut out_be = std::vec![0u16; 17 * 4];
+    unsafe {
+        sse41_bgr48_to_rgba_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_bgr48_to_rgba_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_rgba64_be_le_simd_parity_width17() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = make_rgba64_src(17, 0xCAFE);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 17 * 3];
+    let mut out_be = std::vec![0u8; 17 * 3];
+    unsafe {
+        sse41_rgba64_to_rgb_row::<false>(&le, &mut out_le, 17);
+        sse41_rgba64_to_rgb_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 17 * 4];
+    let mut out_be = std::vec![0u8; 17 * 4];
+    unsafe {
+        sse41_rgba64_to_rgba_row::<false>(&le, &mut out_le, 17);
+        sse41_rgba64_to_rgba_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgba SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 3];
+    let mut out_be = std::vec![0u16; 17 * 3];
+    unsafe {
+        sse41_rgba64_to_rgb_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_rgba64_to_rgb_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 4];
+    let mut out_be = std::vec![0u16; 17 * 4];
+    unsafe {
+        sse41_rgba64_to_rgba_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_rgba64_to_rgba_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_bgra64_be_le_simd_parity_width17() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = make_rgba64_src(17, 0xF00D);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 17 * 3];
+    let mut out_be = std::vec![0u8; 17 * 3];
+    unsafe {
+        sse41_bgra64_to_rgb_row::<false>(&le, &mut out_le, 17);
+        sse41_bgra64_to_rgb_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 17 * 4];
+    let mut out_be = std::vec![0u8; 17 * 4];
+    unsafe {
+        sse41_bgra64_to_rgba_row::<false>(&le, &mut out_le, 17);
+        sse41_bgra64_to_rgba_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgba SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 3];
+    let mut out_be = std::vec![0u16; 17 * 3];
+    unsafe {
+        sse41_bgra64_to_rgb_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_bgra64_to_rgb_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 4];
+    let mut out_be = std::vec![0u16; 17 * 4];
+    unsafe {
+        sse41_bgra64_to_rgba_u16_row::<false>(&le, &mut out_le, 17);
+        sse41_bgra64_to_rgba_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+// =============================================================================
+// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests
+// =============================================================================
+//
+// Co-located here (rather than in the dead-code `tests/packed_rgb.rs`, which
+// is not declared in `tests/mod.rs`) so they are actually compiled and run.
+// Width 33 = 2 × 16-pixel SSE4.1 SIMD iterations + 1 scalar-tail pixel for
+// the u8 outputs; the u16 output kernel processes 8 px / iter, so 33 = 4 × 8 + 1.
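+//
+// A scalar model of the decode these tests exercise (illustrative sketch
+// only, not part of this patch; it assumes the usual FFmpeg X2RGB10 layout
+// of 2 padding bits at the MSB followed by 10-bit R, G, B, and the same
+// `const BE: bool` convention as the u16 kernels):
+//
+//     fn unpack_x2rgb10<const BE: bool>(bytes: [u8; 4]) -> (u16, u16, u16) {
+//         // Assemble the 32-bit word according to the declared byte order.
+//         let v = if BE { u32::from_be_bytes(bytes) } else { u32::from_le_bytes(bytes) };
+//         let r = ((v >> 20) & 0x3FF) as u16; // bits 29..20
+//         let g = ((v >> 10) & 0x3FF) as u16; // bits 19..10
+//         let b = (v & 0x3FF) as u16;         // bits 9..0
+//         (r, g, b)
+//     }
+//
+// As with the u16 formats, feeding LE bytes through `BE = false` and BE
+// bytes through `BE = true` must yield identical channel values.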
+
+fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec<u32> {
+    let mut state = seed;
+    (0..width)
+        .map(|_| {
+            state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223);
+            state
+        })
+        .collect()
+}
+
+fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec<u8>, std::vec::Vec<u8>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    (le_bytes, be_bytes)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_x2rgb10_be_le_simd_parity_width33() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF);
+    let (le, be) = make_le_be_pair_x2(&intended);
+
+    let mut out_le = std::vec![0u8; 33 * 3];
+    let mut out_be = std::vec![0u8; 33 * 3];
+    unsafe {
+        x2rgb10_to_rgb_row::<false>(&le, &mut out_le, 33);
+        x2rgb10_to_rgb_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u8; 33 * 4];
+    let mut out_be = std::vec![0u8; 33 * 4];
+    unsafe {
+        x2rgb10_to_rgba_row::<false>(&le, &mut out_le, 33);
+        x2rgb10_to_rgba_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u16; 33 * 3];
+    let mut out_be = std::vec![0u16; 33 * 3];
+    unsafe {
+        x2rgb10_to_rgb_u16_row::<false>(&le, &mut out_le, 33);
+        x2rgb10_to_rgb_u16_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity");
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")]
+fn sse41_x2bgr10_be_le_simd_parity_width33() {
+    if !std::arch::is_x86_feature_detected!("sse4.1") {
+        return;
+    }
+    let intended = pseudo_random_x2_intended(33, 0xFEED_FACE);
+    let (le, be) = make_le_be_pair_x2(&intended);
+
+    let mut out_le = std::vec![0u8; 33 * 3];
+    let mut out_be = std::vec![0u8; 33 * 3];
+    unsafe {
+        x2bgr10_to_rgb_row::<false>(&le, &mut out_le, 33);
+        x2bgr10_to_rgb_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u8; 33 * 4];
+    let mut out_be = std::vec![0u8; 33 * 4];
+    unsafe {
+        x2bgr10_to_rgba_row::<false>(&le, &mut out_le, 33);
+        x2bgr10_to_rgba_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity");
+
+    let mut out_le = std::vec![0u16; 33 * 3];
+    let mut out_be = std::vec![0u16; 33 * 3];
+    unsafe {
+        x2bgr10_to_rgb_u16_row::<false>(&le, &mut out_le, 33);
+        x2bgr10_to_rgb_u16_row::<true>(&be, &mut out_be, 33);
+    }
+    assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity");
+}
diff --git a/src/row/dispatch/ayuv64.rs b/src/row/dispatch/ayuv64.rs
index 0d757ee..c598dc6 100644
--- a/src/row/dispatch/ayuv64.rs
+++ b/src/row/dispatch/ayuv64.rs
@@ -567,11 +567,19 @@ mod tests {
     /// Pack one AYUV64 pixel from explicit A / Y / U / V samples (16-bit
     /// native, no shift required).
+    ///
+    /// The helpers below are consumed only by the LE-host-gated tests in this
+    /// module (see the gating policy at the top of `mod tests`); on BE
+    /// hosts (s390x / powerpc64) those tests are skipped, so the helpers
+    /// would appear unused under `-D warnings`. Gate the helpers with the
+    /// same `target_endian = "little"` cfg.
+    #[cfg(target_endian = "little")]
     fn pack_ayuv64(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] {
         [a, y, u, v]
     }

     /// Pack one AYUV64 pixel in big-endian wire format.
+    #[cfg(target_endian = "little")]
     fn pack_ayuv64_be(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] {
         [
             a.swap_bytes(),
@@ -584,12 +592,14 @@ mod tests {
     /// Build a `Vec<u16>` AYUV64 row of `width` pixels with neutral
     /// chroma (U=V=32768) and the given Y / alpha values. Any positive
     /// width is valid (4:4:4, no chroma subsampling).
+    #[cfg(target_endian = "little")]
     fn solid_ayuv64(width: usize, y: u16, a: u16) -> std::vec::Vec<u16> {
         let quad = pack_ayuv64(a, y, 32768, 32768);
         (0..width).flat_map(|_| quad).collect()
     }

     /// Build a `Vec<u16>` AYUV64 row in big-endian wire format.
+    #[cfg(target_endian = "little")]
     fn solid_ayuv64_be(width: usize, y: u16, a: u16) -> std::vec::Vec<u16> {
         let quad = pack_ayuv64_be(a, y, 32768, 32768);
         (0..width).flat_map(|_| quad).collect()
@@ -672,6 +682,13 @@ mod tests {

     // ---- functional smoke ---------------------------------------------------

+    // LE-host gate: this test builds host-native `Vec<u16>` fixtures and calls
+    // the dispatchers with `be_input = false`, which forwards to the scalar
+    // kernel's `from_le` load. On BE hosts (s390x / powerpc64) `from_le` swaps
+    // bytes, so the host-native fixture is corrupted before the math runs and
+    // the assertions break. BE-host correctness is covered by the per-arch BE
+    // parity tests that build fixtures via `to_le_bytes` / `to_be_bytes`.
+    #[cfg(target_endian = "little")]
     #[test]
     fn ayuv64_dispatchers_route_with_simd_false() {
         // Limited-range BT.709: Y=60160 = 235*256 is limited-range white;
@@ -761,6 +778,15 @@ mod tests {
         }
     }

+    // LE-host gate: the LE side uses `solid_ayuv64` (host-native) with
+    // `be_input = false` (→ `from_le`); the BE side uses `pack_ayuv64_be`
+    // (`swap_bytes` of host-native) with `be_input = true` (→ `from_be`).
+    // Both encodings are LE-host-correct only — on a BE host the byte order
+    // in memory does not match what the wrappers decode, so the test must be
+    // pinned to little-endian. Cross-endian agreement on BE hosts is verified
+    // by the per-arch BE parity tests that construct fixtures via
+    // `to_le_bytes` / `to_be_bytes`.
+    #[cfg(target_endian = "little")]
     #[test]
     fn ayuv64_be_and_le_dispatchers_agree() {
         // BE-encoded data decoded with be_input=true must produce the same
diff --git a/src/row/dispatch/packed_rgb_16bit.rs b/src/row/dispatch/packed_rgb_16bit.rs
index 6e31717..8dcdc8b 100644
--- a/src/row/dispatch/packed_rgb_16bit.rs
+++ b/src/row/dispatch/packed_rgb_16bit.rs
@@ -72,7 +72,12 @@ fn rgba64_packed_elems(width: usize) -> usize {
 /// Converts one row of `Rgb48` to packed u8 RGB. Each 16-bit channel is
 /// narrowed via `>> 8`. `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn rgb48_to_rgb_row_endian<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgb_row_bytes(width);
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
@@ -81,38 +86,51 @@ pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_sim
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgb48_to_rgb_row(rgb48, rgb_out, width); }
+                    unsafe { arch::neon::neon_rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width); }
                     return;
                 }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row(rgb48, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgb48_to_rgb_row(rgb48, rgb_out, width);
+    scalar::rgb48_to_rgb_row::<BE>(rgb48, rgb_out, width);
+}
+
+/// LE-only wrapper around [`rgb48_to_rgb_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_rgb_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+    rgb48_to_rgb_row_endian::<false>(rgb48, rgb_out, width, use_simd)
 }

 /// Converts one row of `Rgb48` to packed u8 RGBA. Alpha forced to `0xFF`.
 /// `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn rgb48_to_rgba_row_endian<const BE: bool>(
+    rgb48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgba_row_bytes(width);
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
@@ -121,38 +139,51 @@ pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_s
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgb48_to_rgba_row(rgb48, rgba_out, width); }
+                    unsafe { arch::neon::neon_rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row(rgb48, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgb48_to_rgba_row(rgb48, rgba_out, width);
+    scalar::rgb48_to_rgba_row::<BE>(rgb48, rgba_out, width);
+}
+
+/// LE-only wrapper around [`rgb48_to_rgba_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_rgba_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+    rgb48_to_rgba_row_endian::<false>(rgb48, rgba_out, width, use_simd)
 }

 /// Converts one row of `Rgb48` to native-depth u16 RGB (identity copy).
 /// `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn rgb48_to_rgb_u16_row_endian<const BE: bool>(
+    rgb48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgb_row_elems(width);
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
@@ -161,38 +192,51 @@ pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, us
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); }
+                    unsafe { arch::neon::neon_rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgb48_to_rgb_u16_row(rgb48, rgb_out, width);
+    scalar::rgb48_to_rgb_u16_row::<BE>(rgb48, rgb_out, width);
+}
+
+/// LE-only wrapper around [`rgb48_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    rgb48_to_rgb_u16_row_endian::<false>(rgb48, rgb_out, width, use_simd)
 }

 /// Converts one row of `Rgb48` to native-depth u16 RGBA. Alpha forced to
 /// `0xFFFF`. `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn rgb48_to_rgba_u16_row_endian<const BE: bool>(
+    rgb48: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgba_row_elems(width);
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
@@ -201,32 +245,40 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); }
+                    unsafe { arch::neon::neon_rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgb48_to_rgba_u16_row(rgb48, rgba_out, width);
+    scalar::rgb48_to_rgba_u16_row::<BE>(rgb48, rgba_out, width);
+}
+
+/// LE-only wrapper around [`rgb48_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_rgba_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+    rgb48_to_rgba_u16_row_endian::<false>(rgb48, rgba_out, width, use_simd)
 }

 /// Derives 8-bit luma from one row of `Rgb48` source. Narrows to u8 RGB via
@@ -234,7 +286,7 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize,
 /// `rgb_to_luma_row`. `use_simd = false` forces the scalar path for both steps.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn rgb48_to_luma_row(
+pub fn rgb48_to_luma_row_endian<const BE: bool>(
     rgb48: &[u16],
     luma_out: &mut [u8],
     rgb_scratch: &mut [u8],
@@ -248,17 +300,42 @@
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd);
+    rgb48_to_rgb_row_endian::<BE>(rgb48, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range);
 }

+/// LE-only wrapper around [`rgb48_to_luma_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_luma_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_luma_row(
+    rgb48: &[u16],
+    luma_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    rgb48_to_luma_row_endian::<false>(
+        rgb48,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives u16 luma from one row of `Rgb48` source (Y' is computed at 8-bit
 /// precision and zero-extended). Narrows to u8 RGB via `rgb48_to_rgb_row` into
 /// `rgb_scratch`, then applies `rgb_to_luma_u16_row`. `use_simd = false` forces
 /// the scalar path for both steps.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn rgb48_to_luma_u16_row(
+pub fn rgb48_to_luma_u16_row_endian<const BE: bool>(
     rgb48: &[u16],
     luma_out: &mut [u16],
     rgb_scratch: &mut [u8],
@@ -272,16 +349,41 @@
     assert!(rgb48.len() >= in_min, "rgb48 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd);
+    rgb48_to_rgb_row_endian::<BE>(rgb48, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range);
 }

+/// LE-only wrapper around [`rgb48_to_luma_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_luma_u16_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_luma_u16_row(
+    rgb48: &[u16],
+    luma_out: &mut [u16],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    rgb48_to_luma_u16_row_endian::<false>(
+        rgb48,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives planar HSV from one row of `Rgb48` source (OpenCV 8-bit encoding).
 /// Narrows to u8 RGB via `rgb48_to_rgb_row` into `rgb_scratch`, then applies
 /// `rgb_to_hsv_row`. `use_simd = false` forces the scalar path for both steps.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn rgb48_to_hsv_row(
+pub fn rgb48_to_hsv_row_endian<const BE: bool>(
     rgb48: &[u16],
     h_out: &mut [u8],
     s_out: &mut [u8],
@@ -297,10 +399,27 @@
     assert!(h_out.len() >= width, "h_out row too short");
     assert!(s_out.len() >= width, "s_out row too short");
     assert!(v_out.len() >= width, "v_out row too short");
-    rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd);
+    rgb48_to_rgb_row_endian::<BE>(rgb48, rgb_scratch, width, use_simd);
     scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width);
 }

+/// LE-only wrapper around [`rgb48_to_hsv_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgb48_to_hsv_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgb48_to_hsv_row(
+    rgb48: &[u16],
+    h_out: &mut [u8],
+    s_out: &mut [u8],
+    v_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
+    rgb48_to_hsv_row_endian::<false>(rgb48, h_out, s_out, v_out, rgb_scratch, width, use_simd)
+}
+
 // =============================================================================
 // Bgr48 (B, G, R — 3 u16 elements per pixel)
 // =============================================================================
@@ -308,7 +427,12 @@ pub fn rgb48_to_hsv_row(
 /// Converts one row of `Bgr48` to packed u8 RGB (B↔R swap, narrow via `>> 8`).
 /// `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn bgr48_to_rgb_row_endian<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgb_row_bytes(width);
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
@@ -317,38 +441,51 @@ pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_sim
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgr48_to_rgb_row(bgr48, rgb_out, width); }
+                    unsafe { arch::neon::neon_bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row(bgr48, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::bgr48_to_rgb_row(bgr48, rgb_out, width);
+    scalar::bgr48_to_rgb_row::<BE>(bgr48, rgb_out, width);
+}
+
+/// LE-only wrapper around [`bgr48_to_rgb_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_rgb_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+    bgr48_to_rgb_row_endian::<false>(bgr48, rgb_out, width, use_simd)
 }

 /// Converts one row of `Bgr48` to packed u8 RGBA (B↔R swap, alpha forced to
 /// `0xFF`). `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn bgr48_to_rgba_row_endian<const BE: bool>(
+    bgr48: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgba_row_bytes(width);
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
@@ -357,38 +494,51 @@ pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_s
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgr48_to_rgba_row(bgr48, rgba_out, width); }
+                    unsafe { arch::neon::neon_bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row(bgr48, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::bgr48_to_rgba_row(bgr48, rgba_out, width);
+    scalar::bgr48_to_rgba_row::<BE>(bgr48, rgba_out, width);
+}
+
+/// LE-only wrapper around [`bgr48_to_rgba_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_rgba_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+    bgr48_to_rgba_row_endian::<false>(bgr48, rgba_out, width, use_simd)
 }

 /// Converts one row of `Bgr48` to native-depth u16 RGB (B↔R swap, values
 /// unchanged). `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn bgr48_to_rgb_u16_row_endian<const BE: bool>(
+    bgr48: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgb_row_elems(width);
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
@@ -397,38 +547,51 @@ pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, us
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); }
+                    unsafe { arch::neon::neon_bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::bgr48_to_rgb_u16_row(bgr48, rgb_out, width);
+    scalar::bgr48_to_rgb_u16_row::<BE>(bgr48, rgb_out, width);
+}
+
+/// LE-only wrapper around [`bgr48_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    bgr48_to_rgb_u16_row_endian::<false>(bgr48, rgb_out, width, use_simd)
 }

 /// Converts one row of `Bgr48` to native-depth u16 RGBA (B↔R swap, alpha
 /// forced to `0xFFFF`). `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn bgr48_to_rgba_u16_row_endian<const BE: bool>(
+    bgr48: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgb48_packed_elems(width);
     let out_min = rgba_row_elems(width);
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
@@ -437,39 +600,47 @@ pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); }
+                    unsafe { arch::neon::neon_bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::bgr48_to_rgba_u16_row(bgr48, rgba_out, width);
+    scalar::bgr48_to_rgba_u16_row::<BE>(bgr48, rgba_out, width);
+}
+
+/// LE-only wrapper around [`bgr48_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_rgba_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+    bgr48_to_rgba_u16_row_endian::<false>(bgr48, rgba_out, width, use_simd)
 }

 /// Derives 8-bit luma from one row of `Bgr48` source. Narrows to u8 RGB via
 /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgr48_to_luma_row(
+pub fn bgr48_to_luma_row_endian<const BE: bool>(
     bgr48: &[u16],
     luma_out: &mut [u8],
     rgb_scratch: &mut [u8],
@@ -483,15 +654,40 @@
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd);
+    bgr48_to_rgb_row_endian::<BE>(bgr48, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range);
 }

+/// LE-only wrapper around [`bgr48_to_luma_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_luma_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_luma_row(
+    bgr48: &[u16],
+    luma_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    bgr48_to_luma_row_endian::<false>(
+        bgr48,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives u16 luma from one row of `Bgr48` source. Narrows to u8 RGB via
 /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgr48_to_luma_u16_row(
+pub fn bgr48_to_luma_u16_row_endian<const BE: bool>(
     bgr48: &[u16],
     luma_out: &mut [u16],
     rgb_scratch: &mut [u8],
@@ -505,15 +701,40 @@
     assert!(bgr48.len() >= in_min, "bgr48 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd);
+    bgr48_to_rgb_row_endian::<BE>(bgr48, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range);
 }

+/// LE-only wrapper around [`bgr48_to_luma_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_luma_u16_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_luma_u16_row(
+    bgr48: &[u16],
+    luma_out: &mut [u16],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    bgr48_to_luma_u16_row_endian::<false>(
+        bgr48,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives planar HSV from one row of `Bgr48` source. Narrows to u8 RGB via
 /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgr48_to_hsv_row(
+pub fn bgr48_to_hsv_row_endian<const BE: bool>(
     bgr48: &[u16],
     h_out: &mut [u8],
     s_out: &mut [u8],
@@ -529,10 +750,27 @@
     assert!(h_out.len() >= width, "h_out row too short");
     assert!(s_out.len() >= width, "s_out row too short");
     assert!(v_out.len() >= width, "v_out row too short");
-    bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd);
+    bgr48_to_rgb_row_endian::<BE>(bgr48, rgb_scratch, width, use_simd);
     scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width);
 }

+/// LE-only wrapper around [`bgr48_to_hsv_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgr48_to_hsv_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgr48_to_hsv_row(
+    bgr48: &[u16],
+    h_out: &mut [u8],
+    s_out: &mut [u8],
+    v_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
+    bgr48_to_hsv_row_endian::<false>(bgr48, h_out, s_out, v_out, rgb_scratch, width, use_simd)
+}
+
 // =============================================================================
 // Rgba64 (R, G, B, A — 4 u16 elements per pixel, source alpha real)
 // =============================================================================
@@ -540,7 +778,12 @@ pub fn bgr48_to_hsv_row(
 /// Converts one row of `Rgba64` to packed u8 RGB. Source alpha is discarded;
 /// R/G/B narrowed via `>> 8`. `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn rgba64_to_rgb_row_endian<const BE: bool>(
+    rgba64: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba64_packed_elems(width);
     let out_min = rgb_row_bytes(width);
     assert!(rgba64.len() >= in_min, "rgba64 row too short");
@@ -549,38 +792,51 @@ pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_s
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgba64_to_rgb_row(rgba64, rgb_out, width); }
+                    unsafe { arch::neon::neon_rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row(rgba64, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgba64_to_rgb_row(rgba64, rgb_out, width);
+    scalar::rgba64_to_rgb_row::<BE>(rgba64, rgb_out, width);
+}
+
+/// LE-only wrapper around [`rgba64_to_rgb_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgba64_to_rgb_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+    rgba64_to_rgb_row_endian::<false>(rgba64, rgb_out, width, use_simd)
 }

 /// Converts one row of `Rgba64` to packed u8 RGBA. All 4 channels narrowed via
 /// `>> 8`; source alpha passes through. `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn rgba64_to_rgba_row_endian<const BE: bool>(
+    rgba64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba64_packed_elems(width);
     let out_min = rgba_row_bytes(width);
     assert!(rgba64.len() >= in_min, "rgba64 row too short");
@@ -589,38 +845,51 @@ pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgba64_to_rgba_row(rgba64, rgba_out, width); }
+                    unsafe { arch::neon::neon_rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row(rgba64, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgba64_to_rgba_row(rgba64, rgba_out, width);
+    scalar::rgba64_to_rgba_row::<BE>(rgba64, rgba_out, width);
+}
+
+/// LE-only wrapper around [`rgba64_to_rgba_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgba64_to_rgba_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+    rgba64_to_rgba_row_endian::<false>(rgba64, rgba_out, width, use_simd)
 }

 /// Converts one row of `Rgba64` to native-depth u16 RGB. Source alpha
 /// discarded; R/G/B copied as-is. `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn rgba64_to_rgb_u16_row_endian<const BE: bool>(
+    rgba64: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba64_packed_elems(width);
     let out_min = rgb_row_elems(width);
     assert!(rgba64.len() >= in_min, "rgba64 row too short");
@@ -629,38 +898,51 @@ pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); }
+                    unsafe { arch::neon::neon_rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgba64_to_rgb_u16_row(rgba64, rgb_out, width);
+    scalar::rgba64_to_rgb_u16_row::<BE>(rgba64, rgb_out, width);
+}
+
+/// LE-only wrapper around [`rgba64_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgba64_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    rgba64_to_rgb_u16_row_endian::<false>(rgba64, rgb_out, width, use_simd)
 }

 /// Converts one row of `Rgba64` to native-depth u16 RGBA (identity copy of all
 /// 4 channels; source alpha preserved). `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn rgba64_to_rgba_u16_row_endian<const BE: bool>(
+    rgba64: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba64_packed_elems(width);
     let out_min = rgba_row_elems(width);
     assert!(rgba64.len() >= in_min, "rgba64 row too short");
@@ -669,32 +951,40 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); }
+                    unsafe { arch::neon::neon_rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
            },
            target_arch = "x86_64" => {
                if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
                if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
                if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width); }
                    return;
                }
            },
            all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width); }
                return;
            },
            _ => {}
        }
    }
-    scalar::rgba64_to_rgba_u16_row(rgba64, rgba_out, width);
+    scalar::rgba64_to_rgba_u16_row::<BE>(rgba64, rgba_out, width);
+}
+
+/// LE-only wrapper around [`rgba64_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgba64_to_rgba_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+    rgba64_to_rgba_u16_row_endian::<false>(rgba64, rgba_out, width, use_simd)
 }

 /// Derives 8-bit luma from one row of `Rgba64` source. Narrows to u8 RGB via
@@ -702,7 +992,7 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize
 /// Source alpha is discarded.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn rgba64_to_luma_row(
+pub fn rgba64_to_luma_row_endian<const BE: bool>(
     rgba64: &[u16],
     luma_out: &mut [u8],
     rgb_scratch: &mut [u8],
@@ -716,16 +1006,41 @@
     assert!(rgba64.len() >= in_min, "rgba64 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd);
+    rgba64_to_rgb_row_endian::<BE>(rgba64, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range);
 }

+/// LE-only wrapper around [`rgba64_to_luma_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `rgba64_to_luma_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn rgba64_to_luma_row(
+    rgba64: &[u16],
+    luma_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    rgba64_to_luma_row_endian::<false>(
+        rgba64,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives u16 luma from one row of `Rgba64` source. Narrows to u8 RGB via
 /// `rgba64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`.
 /// Source alpha is discarded.
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_u16_row( +pub fn rgba64_to_luma_u16_row_endian( rgba64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -739,16 +1054,41 @@ pub fn rgba64_to_luma_u16_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row_endian::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`rgba64_to_luma_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_luma_u16_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_luma_u16_row( + rgba64: &[u16], + luma_out: &mut [u16], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + rgba64_to_luma_u16_row_endian::( + rgba64, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives planar HSV from one row of `Rgba64` source. Narrows to u8 RGB via /// `rgba64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. /// Source alpha is discarded. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_hsv_row( +pub fn rgba64_to_hsv_row_endian( rgba64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -764,10 +1104,27 @@ pub fn rgba64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row_endian::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } +/// LE-only wrapper around [`rgba64_to_hsv_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_hsv_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_hsv_row( + rgba64: &[u16], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + use_simd: bool, +) { + rgba64_to_hsv_row_endian::(rgba64, h_out, s_out, v_out, rgb_scratch, width, use_simd) +} + // ============================================================================= // Bgra64 (B, G, R, A — 4 u16 elements per pixel, source alpha real) // ============================================================================= @@ -775,7 +1132,12 @@ pub fn rgba64_to_hsv_row( /// Converts one row of `Bgra64` to packed u8 RGB (B↔R swap, drop alpha, /// narrow via `>> 8`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_row_endian( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -784,39 +1146,52 @@ pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgb_row(bgra64, rgb_out, width); + scalar::bgra64_to_rgb_row::(bgra64, rgb_out, width); +} + +/// LE-only wrapper around [`bgra64_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { + bgra64_to_rgb_row_endian::(bgra64, rgb_out, width, use_simd) } /// Converts one row of `Bgra64` to packed u8 RGBA (B↔R swap, all 4 channels /// narrowed via `>> 8`; source alpha passes through). `use_simd = false` forces /// the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgba_row_endian( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -825,38 +1200,51 @@ pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgba_row(bgra64, rgba_out, width); + scalar::bgra64_to_rgba_row::(bgra64, rgba_out, width); +} + +/// LE-only wrapper around [`bgra64_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { + bgra64_to_rgba_row_endian::(bgra64, rgba_out, width, use_simd) } /// Converts one row of `Bgra64` to native-depth u16 RGB (B↔R swap, drop alpha, /// values copied as-is). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_u16_row_endian( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_elems(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -865,38 +1253,51 @@ pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, cfg_select! 
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); }
+                    unsafe { arch::neon::neon_bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width); }
                     return;
                 }
             },
             all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width); }
                 return;
             },
             _ => {}
         }
     }
-    scalar::bgra64_to_rgb_u16_row(bgra64, rgb_out, width);
+    scalar::bgra64_to_rgb_u16_row::<BE>(bgra64, rgb_out, width);
+}
+
+/// LE-only wrapper around [`bgra64_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgra64_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    bgra64_to_rgb_u16_row_endian::<false>(bgra64, rgb_out, width, use_simd)
 }
 
 /// Converts one row of `Bgra64` to native-depth u16 RGBA (B↔R swap; source
 /// alpha preserved at position 3). `use_simd = false` forces the scalar path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn bgra64_to_rgba_u16_row_endian<const BE: bool>(
+    bgra64: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba64_packed_elems(width);
     let out_min = rgba_row_elems(width);
     assert!(bgra64.len() >= in_min, "bgra64 row too short");
@@ -905,32 +1306,40 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::neon_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); }
+                    unsafe { arch::neon::neon_bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); }
+                    unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); }
+                    unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); }
+                    unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width); }
                     return;
                 }
             },
             all(target_arch = "wasm32", target_feature = "simd128") => {
-                unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); }
+                unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width); }
                 return;
            },
             _ => {}
         }
     }
-    scalar::bgra64_to_rgba_u16_row(bgra64, rgba_out, width);
+    scalar::bgra64_to_rgba_u16_row::<BE>(bgra64, rgba_out, width);
+}
+
+/// LE-only wrapper around [`bgra64_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgra64_to_rgba_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) {
+    bgra64_to_rgba_u16_row_endian::<false>(bgra64, rgba_out, width, use_simd)
 }
 
 /// Derives 8-bit luma from one row of `Bgra64` source. Narrows to u8 RGB via
@@ -938,7 +1347,7 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize
 /// Source alpha is discarded.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgra64_to_luma_row(
+pub fn bgra64_to_luma_row_endian<const BE: bool>(
     bgra64: &[u16],
     luma_out: &mut [u8],
     rgb_scratch: &mut [u8],
@@ -952,16 +1361,41 @@ pub fn bgra64_to_luma_row(
     assert!(bgra64.len() >= in_min, "bgra64 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd);
+    bgra64_to_rgb_row_endian::<BE>(bgra64, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range);
 }
 
+/// LE-only wrapper around [`bgra64_to_luma_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgra64_to_luma_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgra64_to_luma_row(
+    bgra64: &[u16],
+    luma_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    bgra64_to_luma_row_endian::<false>(
+        bgra64,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
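// For illustration — buffer sizing for the luma dispatchers above (a minimal
// sketch, not part of this patch; `ColorMatrix::Bt709` is assumed here as the
// variant name for the matrix argument). The scratch row is plain u8 RGB, so
// it needs `3 * width` bytes regardless of the 16-bit source layout:
//
//     let width = 1920;
//     let bgra64 = vec![0u16; width * 4];     // B, G, R, A per pixel
//     let mut scratch = vec![0u8; width * 3]; // narrowed RGB staging row
//     let mut luma = vec![0u8; width];
//     bgra64_to_luma_row_endian::<false>(
//         &bgra64, &mut luma, &mut scratch, width,
//         ColorMatrix::Bt709, /* full_range */ true, /* use_simd */ false,
//     );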
 /// Derives u16 luma from one row of `Bgra64` source. Narrows to u8 RGB via
 /// `bgra64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`.
 /// Source alpha is discarded.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgra64_to_luma_u16_row(
+pub fn bgra64_to_luma_u16_row_endian<const BE: bool>(
     bgra64: &[u16],
     luma_out: &mut [u16],
     rgb_scratch: &mut [u8],
@@ -975,16 +1409,41 @@ pub fn bgra64_to_luma_u16_row(
     assert!(bgra64.len() >= in_min, "bgra64 row too short");
     assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short");
     assert!(luma_out.len() >= width, "luma_out row too short");
-    bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd);
+    bgra64_to_rgb_row_endian::<BE>(bgra64, rgb_scratch, width, use_simd);
     scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range);
 }
 
+/// LE-only wrapper around [`bgra64_to_luma_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgra64_to_luma_u16_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgra64_to_luma_u16_row(
+    bgra64: &[u16],
+    luma_out: &mut [u16],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    bgra64_to_luma_u16_row_endian::<false>(
+        bgra64,
+        luma_out,
+        rgb_scratch,
+        width,
+        matrix,
+        full_range,
+        use_simd,
+    )
+}
+
 /// Derives planar HSV from one row of `Bgra64` source. Narrows to u8 RGB via
 /// `bgra64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`.
 /// Source alpha is discarded.
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn bgra64_to_hsv_row(
+pub fn bgra64_to_hsv_row_endian<const BE: bool>(
     bgra64: &[u16],
     h_out: &mut [u8],
     s_out: &mut [u8],
@@ -1000,10 +1459,27 @@ pub fn bgra64_to_hsv_row(
     assert!(h_out.len() >= width, "h_out row too short");
     assert!(s_out.len() >= width, "s_out row too short");
     assert!(v_out.len() >= width, "v_out row too short");
-    bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd);
+    bgra64_to_rgb_row_endian::<BE>(bgra64, rgb_scratch, width, use_simd);
     scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width);
 }
 
+/// LE-only wrapper around [`bgra64_to_hsv_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `bgra64_to_hsv_row_endian::<false>(...)`.
+#[allow(clippy::too_many_arguments)]
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn bgra64_to_hsv_row(
+    bgra64: &[u16],
+    h_out: &mut [u8],
+    s_out: &mut [u8],
+    v_out: &mut [u8],
+    rgb_scratch: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
+    bgra64_to_hsv_row_endian::<false>(bgra64, h_out, s_out, v_out, rgb_scratch, width, use_simd)
+}
+
 // =============================================================================
 // Tests
 // =============================================================================
@@ -1014,6 +1490,19 @@ mod tests {
     //!
     //! Each dispatcher's scalar fallback is exercised via `use_simd = false`.
     //! Overflow-guard tests are gated on 32-bit targets where `usize` is 32 bits.
+    //!
+    //! Many tests in this module build host-native `Vec<u16>` fixtures and
+    //! call the LE-only `*_endian::<false>` forms, which apply
+    //! `u16::from_le` to each element. On big-endian hosts (s390x /
+    //! powerpc64) `from_le` swaps bytes, corrupting the fixture before the
+    //! conversion math runs. Such tests are gated with
+    //! `#[cfg(target_endian = "little")]`. Tests that use only
+    //! byte-symmetric values (`0x0000`, `0xFFFF`, `0x1111`, `0x2222`,
+    //! `0x3333`, ...) or that discard the only non-symmetric u16 (e.g. an
+    //! alpha that is dropped on RGB output) are host-endian-invariant and
+    //! left ungated. BE-host correctness of the underlying kernels is
+    //! covered by the per-arch BE parity tests that construct fixtures via
+    //! `to_le_bytes` / `to_be_bytes`.
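// For illustration — a minimal sketch of a host-endian-invariant fixture under
// the gating rule above (not part of this patch; it assumes the
// `rgb48_to_rgb_row_endian` dispatcher introduced in this diff). The fixture
// is built with `to_be`, so it encodes the same bytes on LE and BE hosts and
// needs no `#[cfg(target_endian = "little")]` gate:
//
//     #[test]
//     fn rgb48_be_fixture_is_host_invariant() {
//         let src = [0x1234u16.to_be(), 0x5678u16.to_be(), 0x9ABCu16.to_be()];
//         let mut rgb = [0u8; 3];
//         rgb48_to_rgb_row_endian::<true>(&src, &mut rgb, 1, false);
//         // `<true>` decodes BE elements, so the high bytes come out on any host.
//         assert_eq!(rgb, [0x12, 0x56, 0x9A]);
//     }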
 
     use super::*;
 
     // ---- helpers -------------------------------------------------------------
 
@@ -1035,53 +1524,57 @@ mod tests {
         // All-white Rgb48: each u16 channel = 0xFFFF; narrowed >> 8 = 0xFF.
         let src = solid_rgb48(4, 0xFFFF);
         let mut rgb = std::vec![0u8; 4 * 3];
-        rgb48_to_rgb_row(&src, &mut rgb, 4, false);
+        rgb48_to_rgb_row_endian::<false>(&src, &mut rgb, 4, false);
         assert!(
             rgb.iter().all(|&v| v == 0xFF),
             "expected all 0xFF, got {rgb:?}"
         );
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_rgba_scalar_path() {
         let src = solid_rgb48(4, 0x1200);
         let mut rgba = std::vec![0u8; 4 * 4];
-        rgb48_to_rgba_row(&src, &mut rgba, 4, false);
+        rgb48_to_rgba_row_endian::<false>(&src, &mut rgba, 4, false);
         for px in rgba.chunks(4) {
             assert_eq!(px[0], 0x12, "R channel");
             assert_eq!(px[3], 0xFF, "alpha forced to 0xFF");
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_rgb_u16_scalar_path() {
         let src = solid_rgb48(4, 0xABCD);
         let mut rgb_u16 = std::vec![0u16; 4 * 3];
-        rgb48_to_rgb_u16_row(&src, &mut rgb_u16, 4, false);
+        rgb48_to_rgb_u16_row_endian::<false>(&src, &mut rgb_u16, 4, false);
         assert!(
             rgb_u16.iter().all(|&v| v == 0xABCD),
             "expected identity copy"
         );
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_rgba_u16_scalar_path() {
         let src = solid_rgb48(4, 0x1234);
         let mut rgba_u16 = std::vec![0u16; 4 * 4];
-        rgb48_to_rgba_u16_row(&src, &mut rgba_u16, 4, false);
+        rgb48_to_rgba_u16_row_endian::<false>(&src, &mut rgba_u16, 4, false);
         for px in rgba_u16.chunks(4) {
             assert_eq!(px[0], 0x1234, "R channel");
             assert_eq!(px[3], 0xFFFF, "alpha forced to 0xFFFF");
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_luma_scalar_path() {
         // All-white Rgb48 (all channels = 0xFF00) → near-white luma in full-range BT.709.
         let src = solid_rgb48(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u8; 4];
-        rgb48_to_luma_row(
+        rgb48_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1095,12 +1588,13 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_luma_u16_scalar_path() {
         let src = solid_rgb48(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u16; 4];
-        rgb48_to_luma_u16_row(
+        rgb48_to_luma_u16_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1117,6 +1611,7 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgb48_dispatcher_to_hsv_scalar_path() {
         // Pure red: R=0xFF00, G=0, B=0 → H=0, S=255, V≈255 in OpenCV encoding.
@@ -1125,7 +1620,7 @@ mod tests {
         let mut h = std::vec![0u8; 1];
         let mut s = std::vec![0u8; 1];
         let mut v = std::vec![0u8; 1];
-        rgb48_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
+        rgb48_to_hsv_row_endian::<false>(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
         assert_eq!(h[0], 0, "H for pure red must be 0");
         assert_eq!(s[0], 255, "S for pure red must be 255");
         assert!(v[0] >= 254, "V for pure red must be near 255, got {}", v[0]);
@@ -1133,22 +1628,24 @@ mod tests {
 
     // ---- Bgr48 ---------------------------------------------------------------
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgr48_dispatcher_to_rgb_scalar_path() {
         // Bgr48 pixel [B=0x1100, G=0x2200, R=0x3300] → rgb [R=0x33, G=0x22, B=0x11].
         let src = [0x1100u16, 0x2200, 0x3300];
         let mut rgb = [0u8; 3];
-        bgr48_to_rgb_row(&src, &mut rgb, 1, false);
+        bgr48_to_rgb_row_endian::<false>(&src, &mut rgb, 1, false);
         assert_eq!(rgb[0], 0x33, "R");
         assert_eq!(rgb[1], 0x22, "G");
         assert_eq!(rgb[2], 0x11, "B");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgr48_dispatcher_to_rgba_scalar_path() {
         let src = [0x1100u16, 0x2200, 0x3300];
         let mut rgba = [0u8; 4];
-        bgr48_to_rgba_row(&src, &mut rgba, 1, false);
+        bgr48_to_rgba_row_endian::<false>(&src, &mut rgba, 1, false);
         assert_eq!(rgba[0], 0x33, "R");
         assert_eq!(rgba[3], 0xFF, "alpha forced to 0xFF");
     }
 
@@ -1157,7 +1654,7 @@ mod tests {
     fn bgr48_dispatcher_to_rgb_u16_scalar_path() {
         let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R
         let mut rgb_u16 = [0u16; 3];
-        bgr48_to_rgb_u16_row(&src, &mut rgb_u16, 1, false);
+        bgr48_to_rgb_u16_row_endian::<false>(&src, &mut rgb_u16, 1, false);
         assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)");
         assert_eq!(rgb_u16[1], 0x2222, "G");
         assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)");
     }
 
@@ -1167,17 +1664,18 @@ mod tests {
     fn bgr48_dispatcher_to_rgba_u16_scalar_path() {
         let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R
         let mut rgba_u16 = [0u16; 4];
-        bgr48_to_rgba_u16_row(&src, &mut rgba_u16, 1, false);
+        bgr48_to_rgba_u16_row_endian::<false>(&src, &mut rgba_u16, 1, false);
         assert_eq!(rgba_u16[0], 0x3333, "R");
         assert_eq!(rgba_u16[3], 0xFFFF, "alpha forced to 0xFFFF");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgr48_dispatcher_to_luma_scalar_path() {
         let src = solid_rgb48(4, 0xFF00); // all channels = 0xFF00
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u8; 4];
-        bgr48_to_luma_row(
+        bgr48_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1191,12 +1689,13 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgr48_dispatcher_to_luma_u16_scalar_path() {
         let src = solid_rgb48(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u16; 4];
-        bgr48_to_luma_u16_row(
+        bgr48_to_luma_u16_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1210,6 +1709,7 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgr48_dispatcher_to_hsv_scalar_path() {
         // Pure blue in Bgr48 layout: B=0xFF00, G=0, R=0.
@@ -1219,7 +1719,7 @@ mod tests {
         let mut h = std::vec![0u8; 1];
         let mut s = std::vec![0u8; 1];
         let mut v = std::vec![0u8; 1];
-        bgr48_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
+        bgr48_to_hsv_row_endian::<false>(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
         assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding");
         assert_eq!(s[0], 255, "S for pure blue must be 255");
         assert!(
@@ -1231,23 +1731,25 @@ mod tests {
 
     // ---- Rgba64 --------------------------------------------------------------
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_rgb_scalar_path() {
         // Source alpha should be dropped; R/G/B narrowed.
         let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; // R, G, B, A
         let mut rgb = [0u8; 3];
-        rgba64_to_rgb_row(&src, &mut rgb, 1, false);
+        rgba64_to_rgb_row_endian::<false>(&src, &mut rgb, 1, false);
         assert_eq!(rgb[0], 0x11, "R");
         assert_eq!(rgb[1], 0x22, "G");
         assert_eq!(rgb[2], 0x33, "B");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_rgba_scalar_path() {
         // Source alpha 0xABCD → 0xAB after >> 8.
         let src = [0x1100u16, 0x2200, 0x3300, 0xABCD];
         let mut rgba = [0u8; 4];
-        rgba64_to_rgba_row(&src, &mut rgba, 1, false);
+        rgba64_to_rgba_row_endian::<false>(&src, &mut rgba, 1, false);
         assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8");
     }
 
@@ -1255,29 +1757,31 @@ mod tests {
     fn rgba64_dispatcher_to_rgb_u16_scalar_path() {
         let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD];
         let mut rgb_u16 = [0u16; 3];
-        rgba64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false);
+        rgba64_to_rgb_u16_row_endian::<false>(&src, &mut rgb_u16, 1, false);
         assert_eq!(rgb_u16[0], 0x1111, "R");
         assert_eq!(rgb_u16[1], 0x2222, "G");
         assert_eq!(rgb_u16[2], 0x3333, "B");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_rgba_u16_scalar_path() {
         // Identity copy; source alpha preserved.
         let src = [0x1111u16, 0x2222, 0x3333, 0xABCD];
         let mut rgba_u16 = [0u16; 4];
-        rgba64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false);
+        rgba64_to_rgba_u16_row_endian::<false>(&src, &mut rgba_u16, 1, false);
         assert_eq!(rgba_u16[0], 0x1111, "R");
         assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_luma_scalar_path() {
         // All-white Rgba64 (alpha irrelevant for luma path).
         let src = solid_rgba64(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u8; 4];
-        rgba64_to_luma_row(
+        rgba64_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1291,12 +1795,13 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_luma_u16_scalar_path() {
         let src = solid_rgba64(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u16; 4];
-        rgba64_to_luma_u16_row(
+        rgba64_to_luma_u16_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1313,6 +1818,7 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn rgba64_dispatcher_to_hsv_scalar_path() {
         // Pure green Rgba64: R=0, G=0xFF00, B=0, A=anything → H=60, S=255, V≈255.
@@ -1321,7 +1827,7 @@ mod tests {
         let mut h = std::vec![0u8; 1];
         let mut s = std::vec![0u8; 1];
         let mut v = std::vec![0u8; 1];
-        rgba64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
+        rgba64_to_hsv_row_endian::<false>(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
         assert_eq!(h[0], 60, "H for pure green must be 60 in OpenCV encoding");
         assert_eq!(s[0], 255, "S for pure green must be 255");
         assert!(
@@ -1333,23 +1839,25 @@ mod tests {
 
     // ---- Bgra64 --------------------------------------------------------------
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgra64_dispatcher_to_rgb_scalar_path() {
         // Bgra64: B=0x1100, G=0x2200, R=0x3300, A=0xDEAD → RGB [R=0x33, G=0x22, B=0x11].
         let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD];
         let mut rgb = [0u8; 3];
-        bgra64_to_rgb_row(&src, &mut rgb, 1, false);
+        bgra64_to_rgb_row_endian::<false>(&src, &mut rgb, 1, false);
         assert_eq!(rgb[0], 0x33, "R");
         assert_eq!(rgb[1], 0x22, "G");
         assert_eq!(rgb[2], 0x11, "B");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgra64_dispatcher_to_rgba_scalar_path() {
         // Source alpha 0xABCD → 0xAB after >> 8; channels swapped.
         let src = [0x1100u16, 0x2200, 0x3300, 0xABCD];
         let mut rgba = [0u8; 4];
-        bgra64_to_rgba_row(&src, &mut rgba, 1, false);
+        bgra64_to_rgba_row_endian::<false>(&src, &mut rgba, 1, false);
         assert_eq!(rgba[0], 0x33, "R (from position 2)");
         assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8");
     }
 
@@ -1358,27 +1866,29 @@ mod tests {
     fn bgra64_dispatcher_to_rgb_u16_scalar_path() {
         let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; // B, G, R, A
         let mut rgb_u16 = [0u16; 3];
-        bgra64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false);
+        bgra64_to_rgb_u16_row_endian::<false>(&src, &mut rgb_u16, 1, false);
         assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)");
         assert_eq!(rgb_u16[1], 0x2222, "G");
         assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgra64_dispatcher_to_rgba_u16_scalar_path() {
         let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; // B, G, R, A
         let mut rgba_u16 = [0u16; 4];
-        bgra64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false);
+        bgra64_to_rgba_u16_row_endian::<false>(&src, &mut rgba_u16, 1, false);
         assert_eq!(rgba_u16[0], 0x3333, "R (from position 2)");
         assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved");
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgra64_dispatcher_to_luma_scalar_path() {
         let src = solid_rgba64(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u8; 4];
-        bgra64_to_luma_row(
+        bgra64_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1392,12 +1902,13 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
    #[test]
     fn bgra64_dispatcher_to_luma_u16_scalar_path() {
         let src = solid_rgba64(4, 0xFF00);
         let mut scratch = std::vec![0u8; 4 * 3];
         let mut luma = std::vec![0u16; 4];
-        bgra64_to_luma_u16_row(
+        bgra64_to_luma_u16_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1414,6 +1925,7 @@ mod tests {
         }
     }
 
+    #[cfg(target_endian = "little")]
     #[test]
     fn bgra64_dispatcher_to_hsv_scalar_path() {
         // Pure blue in Bgra64 layout: B=0xFF00, G=0, R=0, A=any.
@@ -1423,7 +1935,7 @@ mod tests {
         let mut h = std::vec![0u8; 1];
         let mut s = std::vec![0u8; 1];
         let mut v = std::vec![0u8; 1];
-        bgra64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
+        bgra64_to_hsv_row_endian::<false>(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false);
         assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding");
         assert_eq!(s[0], 255, "S for pure blue must be 255");
         assert!(
@@ -1440,7 +1952,7 @@ mod tests {
     fn rgb48_to_rgb_row_rejects_short_input() {
         let src = [0u16; 2]; // needs 3 for width=1
         let mut out = [0u8; 3];
-        rgb48_to_rgb_row(&src, &mut out, 1, false);
+        rgb48_to_rgb_row_endian::<false>(&src, &mut out, 1, false);
     }
 
     #[test]
@@ -1448,7 +1960,7 @@ mod tests {
     fn rgb48_to_rgb_row_rejects_short_output() {
         let src = [0u16; 3];
         let mut out = [0u8; 2]; // needs 3
-        rgb48_to_rgb_row(&src, &mut out, 1, false);
+        rgb48_to_rgb_row_endian::<false>(&src, &mut out, 1, false);
     }
 
     #[test]
@@ -1456,7 +1968,7 @@ mod tests {
     fn rgba64_to_rgb_row_rejects_short_input() {
         let src = [0u16; 3]; // needs 4 for width=1
         let mut out = [0u8; 3];
-        rgba64_to_rgb_row(&src, &mut out, 1, false);
+        rgba64_to_rgb_row_endian::<false>(&src, &mut out, 1, false);
     }
 
     #[test]
@@ -1464,7 +1976,7 @@ mod tests {
     fn rgba64_to_rgba_row_rejects_short_output() {
         let src = [0u16; 4];
         let mut out = [0u8; 3]; // needs 4
-        rgba64_to_rgba_row(&src, &mut out, 1, false);
+        rgba64_to_rgba_row_endian::<false>(&src, &mut out, 1, false);
     }
 
     #[test]
@@ -1473,7 +1985,7 @@ mod tests {
         let src = [0u16; 3];
         let mut scratch = [0u8; 3];
         let mut luma: [u8; 0] = [];
-        rgb48_to_luma_row(
+        rgb48_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1490,7 +2002,7 @@ mod tests {
         let src = [0u16; 3];
         let mut scratch = [0u8; 2]; // needs 3
         let mut luma = [0u8; 1];
-        rgb48_to_luma_row(
+        rgb48_to_luma_row_endian::<false>(
             &src,
             &mut luma,
             &mut scratch,
@@ -1521,7 +2033,7 @@ mod tests {
     fn rgb48_dispatcher_rejects_width_times_3_overflow() {
         let p: [u16; 0] = [];
         let mut out: [u8; 0] = [];
-        rgb48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false);
+        rgb48_to_rgb_row_endian::<false>(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false);
     }
 
     #[cfg(target_pointer_width = "32")]
@@ -1530,7 +2042,7 @@ mod tests {
     fn bgr48_dispatcher_rejects_width_times_3_overflow() {
         let p: [u16; 0] = [];
         let mut out: [u8; 0] = [];
-        bgr48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false);
+        bgr48_to_rgb_row_endian::<false>(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false);
     }
 
     #[cfg(target_pointer_width = "32")]
@@ -1539,7 +2051,7 @@ mod tests {
     fn rgba64_dispatcher_rejects_width_times_4_overflow() {
         let p: [u16; 0] = [];
         let mut out: [u8; 0] = [];
-        rgba64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false);
+        rgba64_to_rgb_row_endian::<false>(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false);
    }
 
     #[cfg(target_pointer_width = "32")]
@@ -1548,6 +2060,6 @@ mod tests {
     fn bgra64_dispatcher_rejects_width_times_4_overflow() {
         let p: [u16; 0] = [];
         let mut out: [u8; 0] = [];
-        bgra64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false);
+        bgra64_to_rgb_row_endian::<false>(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false);
     }
 }
diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs
index 9b93a40..f2c1dd8 100644
--- a/src/row/dispatch/rgb_ops.rs
+++ b/src/row/dispatch/rgb_ops.rs
@@ -948,7 +948,12 @@ pub fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize, use_simd
 ///
 /// `use_simd = false` forces the scalar reference path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn x2rgb10_to_rgb_row_endian<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba_row_bytes(width);
     let rgb_min = rgb_row_bytes(width);
     assert!(x2rgb10.len() >= in_min, "x2rgb10 row too short");
@@ -958,34 +963,42 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::neon::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_avx512::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_avx2::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_sse41::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width);
+    scalar::x2rgb10_to_rgb_row::<BE>(x2rgb10, rgb_out, width);
+}
+
+/// LE-only wrapper around [`x2rgb10_to_rgb_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `x2rgb10_to_rgb_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+    x2rgb10_to_rgb_row_endian::<false>(x2rgb10, rgb_out, width, use_simd)
 }
 
 /// Drops the 2-bit padding, down-shifts to 8 bits, and forces alpha
@@ -993,7 +1006,12 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_
 ///
 /// `use_simd = false` forces the scalar reference path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn x2rgb10_to_rgba_row_endian<const BE: bool>(
+    x2rgb10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let rgba_min = rgba_row_bytes(width);
     assert!(x2rgb10.len() >= rgba_min, "x2rgb10 row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
@@ -1002,34 +1020,42 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); }
+                    unsafe { arch::neon::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); }
+                    unsafe { arch::x86_avx512::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); }
+                    unsafe { arch::x86_avx2::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); }
+                    unsafe { arch::x86_sse41::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); }
+                    unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width);
+    scalar::x2rgb10_to_rgba_row::<BE>(x2rgb10, rgba_out, width);
+}
+
+/// LE-only wrapper around [`x2rgb10_to_rgba_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `x2rgb10_to_rgba_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+    x2rgb10_to_rgba_row_endian::<false>(x2rgb10, rgba_out, width, use_simd)
 }
 
 /// Extracts each 10-bit channel into native-depth `u16` (low-bit
@@ -1038,7 +1064,12 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us
 ///
 /// `use_simd = false` forces the scalar reference path.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn x2rgb10_to_rgb_u16_row_endian<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba_row_bytes(width);
     // u16 RGB output is sized in `u16` *elements*, not bytes — match
     // the rest of the high-bit-depth dispatchers.
@@ -1050,41 +1081,54 @@ pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::neon::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); }
+                    unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width);
+    scalar::x2rgb10_to_rgb_u16_row::<BE>(x2rgb10, rgb_out, width);
+}
+
+/// LE-only wrapper around [`x2rgb10_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `x2rgb10_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    x2rgb10_to_rgb_u16_row_endian::<false>(x2rgb10, rgb_out, width, use_simd)
 }
 
 /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_row`]. Channel
 /// positions in the source `u32` are reversed; output is still
 /// `R, G, B`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn x2bgr10_to_rgb_row_endian<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba_row_bytes(width);
     let rgb_min = rgb_row_bytes(width);
     assert!(x2bgr10.len() >= in_min, "x2bgr10 row too short");
@@ -1094,39 +1138,52 @@ pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::neon::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_avx512::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_avx2::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_sse41::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width);
+    scalar::x2bgr10_to_rgb_row::<BE>(x2bgr10, rgb_out, width);
+}
+
+/// LE-only wrapper around [`x2bgr10_to_rgb_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `x2bgr10_to_rgb_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) {
+    x2bgr10_to_rgb_row_endian::<false>(x2bgr10, rgb_out, width, use_simd)
 }
 
 /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgba_row`].
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+pub fn x2bgr10_to_rgba_row_endian<const BE: bool>(
+    x2bgr10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+    use_simd: bool,
+) {
     let rgba_min = rgba_row_bytes(width);
     assert!(x2bgr10.len() >= rgba_min, "x2bgr10 row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
@@ -1135,39 +1192,52 @@ pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, us
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); }
+                    unsafe { arch::neon::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); }
+                    unsafe { arch::x86_avx512::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); }
+                    unsafe { arch::x86_avx2::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); }
+                    unsafe { arch::x86_sse41::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); }
+                    unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width);
+    scalar::x2bgr10_to_rgba_row::<BE>(x2bgr10, rgba_out, width);
+}
+
+/// LE-only wrapper around [`x2bgr10_to_rgba_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged.
+/// Equivalent to `x2bgr10_to_rgba_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) {
+    x2bgr10_to_rgba_row_endian::<false>(x2bgr10, rgba_out, width, use_simd)
 }
 
 /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_u16_row`].
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+pub fn x2bgr10_to_rgb_u16_row_endian<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+    use_simd: bool,
+) {
     let in_min = rgba_row_bytes(width);
     // u16 RGB output is sized in `u16` *elements*, not bytes.
     let rgb_min = rgb_row_elems(width);
@@ -1178,32 +1248,40 @@ pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize,
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
-                    unsafe { arch::neon::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::neon::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
-                    unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
                 if avx2_available() {
-                    unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
                 if sse41_available() {
-                    unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
-                    unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); }
+                    unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width); }
                     return;
                 }
             },
             _ => {}
         }
     }
-    scalar::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width);
+    scalar::x2bgr10_to_rgb_u16_row::<BE>(x2bgr10, rgb_out, width);
+}
+
+/// LE-only wrapper around [`x2bgr10_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-generic public signature so existing little-endian callers
+/// compile unchanged. Equivalent to `x2bgr10_to_rgb_u16_row_endian::<false>(...)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) {
+    x2bgr10_to_rgb_u16_row_endian::<false>(x2bgr10, rgb_out, width, use_simd)
 }
diff --git a/src/row/mod.rs b/src/row/mod.rs
index 714f0e4..b2502de 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -91,18 +91,34 @@ pub(crate) use dispatch::mono1bit::*;
 // parameter) are re-exported as `pub(crate)` for sinker use — the underlying
 // functions in `dispatch::packed_rgb_16bit` are `pub`, but only this
 // re-export visibility is visible outside the crate.
+//
+// Each function exists in two forms:
+// - `foo` — backwards-compatible LE-only wrapper (no const generic), preserves
+//   the pre-Tier 8 public signature so existing little-endian downstream
+//   callers compile unchanged.
+// - `foo_endian::<BE>` — endian-aware form (added in Tier 8 for
+//   the BE-on-BE-host plane contract). Used by sinker code internally.
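// For illustration — the two call forms described above, side by side (a
// minimal sketch, not part of this patch; buffer sizes follow the dispatcher
// contracts):
//
//     let width = 1280;
//     let rgb48 = vec![0u16; width * 3];
//     let mut rgb = vec![0u8; width * 3];
//     // Pre-Tier-8 form: fixed LE interpretation, signature unchanged.
//     rgb48_to_rgb_row(&rgb48, &mut rgb, width, true);
//     // Endian-aware form: the const generic picks BE (true) or LE (false).
//     rgb48_to_rgb_row_endian::<false>(&rgb48, &mut rgb, width, true);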
 pub use dispatch::packed_rgb_16bit::{
-    bgr48_to_rgb_row, bgr48_to_rgb_u16_row, bgr48_to_rgba_row, bgr48_to_rgba_u16_row,
-    bgra64_to_rgb_row, bgra64_to_rgb_u16_row, bgra64_to_rgba_row, bgra64_to_rgba_u16_row,
-    rgb48_to_rgb_row, rgb48_to_rgb_u16_row, rgb48_to_rgba_row, rgb48_to_rgba_u16_row,
-    rgba64_to_rgb_row, rgba64_to_rgb_u16_row, rgba64_to_rgba_row, rgba64_to_rgba_u16_row,
+    bgr48_to_rgb_row, bgr48_to_rgb_row_endian, bgr48_to_rgb_u16_row, bgr48_to_rgb_u16_row_endian,
+    bgr48_to_rgba_row, bgr48_to_rgba_row_endian, bgr48_to_rgba_u16_row, bgr48_to_rgba_u16_row_endian,
+    bgra64_to_rgb_row, bgra64_to_rgb_row_endian, bgra64_to_rgb_u16_row, bgra64_to_rgb_u16_row_endian,
+    bgra64_to_rgba_row, bgra64_to_rgba_row_endian, bgra64_to_rgba_u16_row,
+    bgra64_to_rgba_u16_row_endian, rgb48_to_rgb_row, rgb48_to_rgb_row_endian, rgb48_to_rgb_u16_row,
+    rgb48_to_rgb_u16_row_endian, rgb48_to_rgba_row, rgb48_to_rgba_row_endian, rgb48_to_rgba_u16_row,
+    rgb48_to_rgba_u16_row_endian, rgba64_to_rgb_row, rgba64_to_rgb_row_endian, rgba64_to_rgb_u16_row,
+    rgba64_to_rgb_u16_row_endian, rgba64_to_rgba_row, rgba64_to_rgba_row_endian,
+    rgba64_to_rgba_u16_row, rgba64_to_rgba_u16_row_endian,
 };
 
 // luma + HSV variants take an extra rgb_scratch parameter — sinker wired in Task 9.
 #[allow(unused_imports)]
 pub(crate) use dispatch::packed_rgb_16bit::{
-    bgr48_to_hsv_row, bgr48_to_luma_row, bgr48_to_luma_u16_row, bgra64_to_hsv_row,
-    bgra64_to_luma_row, bgra64_to_luma_u16_row, rgb48_to_hsv_row, rgb48_to_luma_row,
-    rgb48_to_luma_u16_row, rgba64_to_hsv_row, rgba64_to_luma_row, rgba64_to_luma_u16_row,
+    bgr48_to_hsv_row, bgr48_to_hsv_row_endian, bgr48_to_luma_row, bgr48_to_luma_row_endian,
+    bgr48_to_luma_u16_row, bgr48_to_luma_u16_row_endian, bgra64_to_hsv_row, bgra64_to_hsv_row_endian,
+    bgra64_to_luma_row, bgra64_to_luma_row_endian, bgra64_to_luma_u16_row,
+    bgra64_to_luma_u16_row_endian, rgb48_to_hsv_row, rgb48_to_hsv_row_endian, rgb48_to_luma_row,
+    rgb48_to_luma_row_endian, rgb48_to_luma_u16_row, rgb48_to_luma_u16_row_endian, rgba64_to_hsv_row,
+    rgba64_to_hsv_row_endian, rgba64_to_luma_row, rgba64_to_luma_row_endian, rgba64_to_luma_u16_row,
+    rgba64_to_luma_u16_row_endian,
 };
 
 // Gray dispatchers are pub(crate) — sinker code uses them via crate::row::gray*_row.
 #[cfg(any(feature = "std", feature = "alloc"))]
diff --git a/src/row/scalar/packed_rgb.rs b/src/row/scalar/packed_rgb.rs
index f1c2862..0f4091f 100644
--- a/src/row/scalar/packed_rgb.rs
+++ b/src/row/scalar/packed_rgb.rs
@@ -306,12 +306,17 @@ pub(crate) fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize) {
 /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn x2rgb10_to_rgb_row<const BE: bool>(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = (pix >> 20) & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = pix & 0x3FF;
@@ -330,12 +335,21 @@ pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usiz
 /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn x2rgb10_to_rgba_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = (pix >> 20) & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = pix & 0x3FF;
@@ -355,12 +369,21 @@ pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: us
 /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn x2rgb10_to_rgb_u16_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let dst = x * 3;
         rgb_out[dst] = ((pix >> 20) & 0x3FF) as u16;
         rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16;
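// Worked example (not part of this patch): decoding one X2RGB10 pixel from
// the bytes [0x78, 0x56, 0x34, 0x12], with the 10→8-bit narrowing taken as
// `v10 >> 2` per the dispatcher doc above.
//
//     BE = false → pix = u32::from_le_bytes(..) = 0x12345678
//         r10 = (pix >> 20) & 0x3FF = 0x123 → 8-bit: 0x123 >> 2 = 0x48
//         g10 = (pix >> 10) & 0x3FF = 0x115 → 8-bit: 0x115 >> 2 = 0x45
//         b10 =  pix        & 0x3FF = 0x278 → 8-bit: 0x278 >> 2 = 0x9E
//     BE = true  → pix = u32::from_be_bytes(..) = 0x78563412
//
// The byte-order load is the only thing the const generic changes; the shifts
// and masks are identical in both instantiations.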
@@ -377,12 +400,17 @@ pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width:
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn x2bgr10_to_rgb_row<const BE: bool>(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = pix & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = (pix >> 20) & 0x3FF;
@@ -400,12 +428,21 @@ pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usiz
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn x2bgr10_to_rgba_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = pix & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = (pix >> 20) & 0x3FF;
@@ -423,12 +460,21 @@ pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: us
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn x2bgr10_to_rgb_u16_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let dst = x * 3;
         rgb_out[dst] = (pix & 0x3FF) as u16;
         rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16;
diff --git a/src/row/scalar/packed_rgb_16bit.rs b/src/row/scalar/packed_rgb_16bit.rs
index d530eaa..e568c19 100644
--- a/src/row/scalar/packed_rgb_16bit.rs
+++ b/src/row/scalar/packed_rgb_16bit.rs
@@ -1,8 +1,13 @@
 //! Scalar reference kernels for 16-bit packed RGB sources (Tier 8 finish).
 //!
-//! Input planes are `&[u16]`. Each u16 sample is the native channel value
-//! (range [0, 65535]). No endian conversion — caller deserialises LE bytes
-//! to `&[u16]` before constructing the frame.
+//! Input planes are `&[u16]`. Each u16 sample is either LE- or BE-encoded on
+//! disk/wire; the `BE` const-generic parameter selects the
+//! interpretation. When `BE = false` the input is LE-encoded; when `BE = true`
+//! the input is BE-encoded. In both cases each element is converted to
+//! host-native byte order on load via `u16::from_le` / `u16::from_be`, which
+//! are no-ops when the source byte order already matches the host. This
+//! mirrors the SIMD `load_endian_u16x*` helpers and keeps the scalar reference
+//! correct on big-endian hosts (s390x).
 //!
 //! # Format layouts
 //!
@@ -18,56 +23,101 @@
 //! - u16 → u8: `(v >> 8) as u8` (high-byte extraction, matching Y216 / Ship 11d).
 //! - u16 → u16: identity copy (no scaling).
 
+// ---- Endian load helper ------------------------------------------------------
+
+/// Load one u16 element from a source whose byte order is selected by `BE`,
+/// returning the value in host-native byte order.
+///
+/// `u16::from_be` / `u16::from_le` are target-endian aware: each is a no-op
+/// when the source byte order matches the host, and a `swap_bytes` otherwise.
+/// This matches the SIMD `load_endian_u16x*` helpers and keeps the scalar
+/// reference correct on big-endian hosts (s390x).
+///
+/// The `if BE` branch is evaluated at compile time (monomorphization), so the
+/// unused branch is entirely eliminated from the generated binary.
+#[inline(always)]
+fn load_u16<const BE: bool>(v: u16) -> u16 {
+    if BE { u16::from_be(v) } else { u16::from_le(v) }
+}
+
 // ---- Rgb48 family (3 u16 elements per pixel: R, G, B) ----------------------
 
 /// Rgb48 → packed u8 RGB: narrow each 16-bit channel via `>> 8`.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load so the channel
+/// value is in host-native order before narrowing.
+///
 /// Input stride: `width * 3` u16 elements, output: `width * 3` bytes.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn rgb48_to_rgb_row<const BE: bool>(rgb48: &[u16], rgb_out: &mut [u8], width: usize) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let src = x * 3;
         let dst = x * 3;
-        rgb_out[dst] = (rgb48[src] >> 8) as u8;
-        rgb_out[dst + 1] = (rgb48[src + 1] >> 8) as u8;
-        rgb_out[dst + 2] = (rgb48[src + 2] >> 8) as u8;
+        rgb_out[dst] = (load_u16::<BE>(rgb48[src]) >> 8) as u8;
+        rgb_out[dst + 1] = (load_u16::<BE>(rgb48[src + 1]) >> 8) as u8;
+        rgb_out[dst + 2] = (load_u16::<BE>(rgb48[src + 2]) >> 8) as u8;
     }
 }
 
-/// Rgb48 → packed u16 RGB: identity copy (already R, G, B order).
+/// Rgb48 → packed u16 RGB: copy with optional byte-swap (already R, G, B order).
+///
+/// When `BE = true` each element is byte-swapped so the output contains
+/// host-native u16 values.
 ///
 /// Input and output stride: `width * 3` u16 elements.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_u16_out: &mut [u16], width: usize) {
+pub(crate) fn rgb48_to_rgb_u16_row<const BE: bool>(
+    rgb48: &[u16],
+    rgb_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
-    rgb_u16_out[..width * 3].copy_from_slice(&rgb48[..width * 3]);
+    if BE {
+        for i in 0..width * 3 {
+            rgb_u16_out[i] = u16::from_be(rgb48[i]);
+        }
+    } else {
+        // LE source: use the target-endian-aware load on each element so big-endian
+        // hosts also receive host-native u16 output.
+        for i in 0..width * 3 {
+            rgb_u16_out[i] = u16::from_le(rgb48[i]);
+        }
+    }
 }
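// For illustration — what the two monomorphized copies of `load_u16` reduce to
// (a minimal sketch, not part of this patch). On a little-endian host `from_le`
// is the identity and `from_be` is a byte swap; on a big-endian host the roles
// flip, so exactly one branch survives in each instantiation:
//
//     // On an LE host (x86_64, aarch64 Linux):
//     assert_eq!(load_u16::<false>(0x1234), 0x1234); // no-op
//     assert_eq!(load_u16::<true>(0x1234), 0x3412);  // swap_bytes
//
//     // On a BE host (s390x), the same calls give 0x3412 and 0x1234.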
 
 /// Rgb48 → packed u8 RGBA: narrow each 16-bit channel via `>> 8`, force alpha = 0xFF.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Input stride: `width * 3` u16 elements, output: `width * 4` bytes.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn rgb48_to_rgba_row<const BE: bool>(rgb48: &[u16], rgba_out: &mut [u8], width: usize) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let src = x * 3;
         let dst = x * 4;
-        rgba_out[dst] = (rgb48[src] >> 8) as u8;
-        rgba_out[dst + 1] = (rgb48[src + 1] >> 8) as u8;
-        rgba_out[dst + 2] = (rgb48[src + 2] >> 8) as u8;
+        rgba_out[dst] = (load_u16::<BE>(rgb48[src]) >> 8) as u8;
+        rgba_out[dst + 1] = (load_u16::<BE>(rgb48[src + 1]) >> 8) as u8;
+        rgba_out[dst + 2] = (load_u16::<BE>(rgb48[src + 2]) >> 8) as u8;
         rgba_out[dst + 3] = 0xFF;
     }
 }
 
-/// Rgb48 → packed u16 RGBA: copy R/G/B as-is, force alpha = 0xFFFF.
+/// Rgb48 → packed u16 RGBA: copy R/G/B (with optional byte-swap), force alpha = 0xFFFF.
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Input stride: `width * 3` u16 elements, output: `width * 4` u16 elements.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn rgb48_to_rgba_u16_row<const BE: bool>(
+    rgb48: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short");
     debug_assert!(
         rgba_u16_out.len() >= width * 4,
@@ -76,9 +126,9 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid
     for x in 0..width {
         let src = x * 3;
         let dst = x * 4;
-        rgba_u16_out[dst] = rgb48[src];
-        rgba_u16_out[dst + 1] = rgb48[src + 1];
-        rgba_u16_out[dst + 2] = rgb48[src + 2];
+        rgba_u16_out[dst] = load_u16::<BE>(rgb48[src]);
+        rgba_u16_out[dst + 1] = load_u16::<BE>(rgb48[src + 1]);
+        rgba_u16_out[dst + 2] = load_u16::<BE>(rgb48[src + 2]);
         rgba_u16_out[dst + 3] = 0xFFFF;
     }
 }
@@ -87,54 +137,70 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid
 
 /// Bgr48 → packed u8 RGB: narrow via `>> 8`, swap B↔R on output.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Source layout `[B, G, R]` → output layout `[R, G, B]`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn bgr48_to_rgb_row<const BE: bool>(bgr48: &[u16], rgb_out: &mut [u8], width: usize) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let src = x * 3;
         let dst = x * 3;
-        rgb_out[dst] = (bgr48[src + 2] >> 8) as u8; // R (from B-G-R position 2)
-        rgb_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G (unchanged)
-        rgb_out[dst + 2] = (bgr48[src] >> 8) as u8; // B (from B-G-R position 0)
+        rgb_out[dst] = (load_u16::<BE>(bgr48[src + 2]) >> 8) as u8; // R (from B-G-R position 2)
+        rgb_out[dst + 1] = (load_u16::<BE>(bgr48[src + 1]) >> 8) as u8; // G (unchanged)
+        rgb_out[dst + 2] = (load_u16::<BE>(bgr48[src]) >> 8) as u8; // B (from B-G-R position 0)
     }
 }
 
-/// Bgr48 → packed u16 RGB: copy with B↔R swap.
+/// Bgr48 → packed u16 RGB: copy with B↔R swap (and optional byte-swap).
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Source layout `[B, G, R]` → output layout `[R, G, B]`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_u16_out: &mut [u16], width: usize) {
+pub(crate) fn bgr48_to_rgb_u16_row<const BE: bool>(
+    bgr48: &[u16],
+    rgb_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
 
     for x in 0..width {
         let src = x * 3;
         let dst = x * 3;
-        rgb_u16_out[dst] = bgr48[src + 2]; // R
-        rgb_u16_out[dst + 1] = bgr48[src + 1]; // G
-        rgb_u16_out[dst + 2] = bgr48[src]; // B
+        rgb_u16_out[dst] = load_u16::<BE>(bgr48[src + 2]); // R
+        rgb_u16_out[dst + 1] = load_u16::<BE>(bgr48[src + 1]); // G
+        rgb_u16_out[dst + 2] = load_u16::<BE>(bgr48[src]); // B
     }
 }
 
 /// Bgr48 → packed u8 RGBA: narrow + B↔R swap + force alpha = 0xFF.
+///
+/// When `BE = true` each u16 element is byte-swapped on load.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn bgr48_to_rgba_row<const BE: bool>(bgr48: &[u16], rgba_out: &mut [u8], width: usize) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
     for x in 0..width {
         let src = x * 3;
         let dst = x * 4;
-        rgba_out[dst] = (bgr48[src + 2] >> 8) as u8; // R
-        rgba_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G
-        rgba_out[dst + 2] = (bgr48[src] >> 8) as u8; // B
+        rgba_out[dst] = (load_u16::<BE>(bgr48[src + 2]) >> 8) as u8; // R
+        rgba_out[dst + 1] = (load_u16::<BE>(bgr48[src + 1]) >> 8) as u8; // G
+        rgba_out[dst + 2] = (load_u16::<BE>(bgr48[src]) >> 8) as u8; // B
        rgba_out[dst + 3] = 0xFF;
     }
 }
 
-/// Bgr48 → packed u16 RGBA: B↔R swap + force alpha = 0xFFFF.
+/// Bgr48 → packed u16 RGBA: B↔R swap (+ optional byte-swap) + force alpha = 0xFFFF.
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn bgr48_to_rgba_u16_row<const BE: bool>(
+    bgr48: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short");
     debug_assert!(
         rgba_u16_out.len() >= width * 4,
@@ -143,9 +209,9 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid
     for x in 0..width {
         let src = x * 3;
         let dst = x * 4;
-        rgba_u16_out[dst] = bgr48[src + 2]; // R
-        rgba_u16_out[dst + 1] = bgr48[src + 1]; // G
-        rgba_u16_out[dst + 2] = bgr48[src]; // B
+        rgba_u16_out[dst] = load_u16::<BE>(bgr48[src + 2]); // R
+        rgba_u16_out[dst + 1] = load_u16::<BE>(bgr48[src + 1]); // G
+        rgba_u16_out[dst + 2] = load_u16::<BE>(bgr48[src]); // B
         rgba_u16_out[dst + 3] = 0xFFFF;
     }
 }
 
@@ -154,121 +220,171 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid
 
 /// Rgba64 → packed u8 RGB: drop alpha, narrow R/G/B via `>> 8`.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Input stride: `width * 4` u16 elements, output: `width * 3` bytes.
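+///
+/// For example (identity load), `[R = 0x1100, G = 0x2200, B = 0x3300,
+/// A = 0xDEAD]` narrows to the bytes `[0x11, 0x22, 0x33]`; the alpha sample
+/// is skipped entirely.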
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn rgba64_to_rgb_row<const BE: bool>(rgba64: &[u16], rgb_out: &mut [u8], width: usize) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     for x in 0..width {
         let src = x * 4;
         let dst = x * 3;
-        rgb_out[dst] = (rgba64[src] >> 8) as u8;
-        rgb_out[dst + 1] = (rgba64[src + 1] >> 8) as u8;
-        rgb_out[dst + 2] = (rgba64[src + 2] >> 8) as u8;
+        rgb_out[dst] = (load_u16::<BE>(rgba64[src]) >> 8) as u8;
+        rgb_out[dst + 1] = (load_u16::<BE>(rgba64[src + 1]) >> 8) as u8;
+        rgb_out[dst + 2] = (load_u16::<BE>(rgba64[src + 2]) >> 8) as u8;
     }
 }
 
-/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B as-is.
+/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B (with optional byte-swap).
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Input stride: `width * 4` u16 elements, output: `width * 3` u16 elements.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_u16_out: &mut [u16], width: usize) {
+pub(crate) fn rgba64_to_rgb_u16_row<const BE: bool>(
+    rgba64: &[u16],
+    rgb_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
 
     for x in 0..width {
         let src = x * 4;
         let dst = x * 3;
-        rgb_u16_out[dst] = rgba64[src];
-        rgb_u16_out[dst + 1] = rgba64[src + 1];
-        rgb_u16_out[dst + 2] = rgba64[src + 2];
+        rgb_u16_out[dst] = load_u16::<BE>(rgba64[src]);
+        rgb_u16_out[dst + 1] = load_u16::<BE>(rgba64[src + 1]);
+        rgb_u16_out[dst + 2] = load_u16::<BE>(rgba64[src + 2]);
     }
 }
 
 /// Rgba64 → packed u8 RGBA: narrow all 4 channels via `>> 8` (source alpha passes through).
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Input and output stride: `width * 4` elements.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn rgba64_to_rgba_row<const BE: bool>(
+    rgba64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
     for x in 0..width {
         let i = x * 4;
-        rgba_out[i] = (rgba64[i] >> 8) as u8;
-        rgba_out[i + 1] = (rgba64[i + 1] >> 8) as u8;
-        rgba_out[i + 2] = (rgba64[i + 2] >> 8) as u8;
-        rgba_out[i + 3] = (rgba64[i + 3] >> 8) as u8;
+        rgba_out[i] = (load_u16::<BE>(rgba64[i]) >> 8) as u8;
+        rgba_out[i + 1] = (load_u16::<BE>(rgba64[i + 1]) >> 8) as u8;
+        rgba_out[i + 2] = (load_u16::<BE>(rgba64[i + 2]) >> 8) as u8;
+        rgba_out[i + 3] = (load_u16::<BE>(rgba64[i + 3]) >> 8) as u8;
     }
 }
 
-/// Rgba64 → packed u16 RGBA: identity copy of all 4 channels.
+/// Rgba64 → packed u16 RGBA: copy all 4 channels (with optional byte-swap).
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Input and output stride: `width * 4` u16 elements.
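+///
+/// With `BE = false` on a little-endian host the per-element load is an
+/// identity, so this monomorphizes to a plain element-wise copy; only
+/// big-endian hosts pay for the byte-swap.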
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn rgba64_to_rgba_u16_row<const BE: bool>(
+    rgba64: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short");
     debug_assert!(
         rgba_u16_out.len() >= width * 4,
         "rgba_u16_out row too short"
     );
 
-    rgba_u16_out[..width * 4].copy_from_slice(&rgba64[..width * 4]);
+    if BE {
+        for i in 0..width * 4 {
+            rgba_u16_out[i] = u16::from_be(rgba64[i]);
+        }
+    } else {
+        // `u16::from_le` is a no-op on little-endian hosts (this branch then
+        // compiles to a plain copy); on big-endian hosts it performs the
+        // byte-swap needed to hand back host-native u16 output.
+        for i in 0..width * 4 {
+            rgba_u16_out[i] = u16::from_le(rgba64[i]);
+        }
+    }
 }
 
 // ---- Bgra64 family (4 u16 elements per pixel: B, G, R, A) ------------------
 
 /// Bgra64 → packed u8 RGB: drop alpha, narrow via `>> 8`, swap B↔R on output.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn bgra64_to_rgb_row<const BE: bool>(bgra64: &[u16], rgb_out: &mut [u8], width: usize) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
 
     for x in 0..width {
         let src = x * 4;
         let dst = x * 3;
-        rgb_out[dst] = (bgra64[src + 2] >> 8) as u8; // R (from position 2)
-        rgb_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G (unchanged)
-        rgb_out[dst + 2] = (bgra64[src] >> 8) as u8; // B (from position 0)
+        rgb_out[dst] = (load_u16::<BE>(bgra64[src + 2]) >> 8) as u8; // R (from position 2)
+        rgb_out[dst + 1] = (load_u16::<BE>(bgra64[src + 1]) >> 8) as u8; // G (unchanged)
+        rgb_out[dst + 2] = (load_u16::<BE>(bgra64[src]) >> 8) as u8; // B (from position 0)
     }
 }
 
-/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap.
+/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap (+ optional byte-swap).
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_u16_out: &mut [u16], width: usize) {
+pub(crate) fn bgra64_to_rgb_u16_row<const BE: bool>(
+    bgra64: &[u16],
+    rgb_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short");
 
     for x in 0..width {
         let src = x * 4;
         let dst = x * 3;
-        rgb_u16_out[dst] = bgra64[src + 2]; // R
-        rgb_u16_out[dst + 1] = bgra64[src + 1]; // G
-        rgb_u16_out[dst + 2] = bgra64[src]; // B
+        rgb_u16_out[dst] = load_u16::<BE>(bgra64[src + 2]); // R
+        rgb_u16_out[dst + 1] = load_u16::<BE>(bgra64[src + 1]); // G
+        rgb_u16_out[dst + 2] = load_u16::<BE>(bgra64[src]); // B
     }
 }
 
 /// Bgra64 → packed u8 RGBA: narrow via `>> 8`, swap B↔R, pass through source alpha.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load.
+///
 /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all narrowed `>> 8`).
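+///
+/// For example (identity load), `[B = 0x1100, G = 0x2200, R = 0x3300,
+/// A = 0xAB00]` is written as the bytes `[0x33, 0x22, 0x11, 0xAB]`.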
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn bgra64_to_rgba_row<const BE: bool>(
+    bgra64: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
 
     for x in 0..width {
         let src = x * 4;
         let dst = x * 4;
-        rgba_out[dst] = (bgra64[src + 2] >> 8) as u8; // R
-        rgba_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G
-        rgba_out[dst + 2] = (bgra64[src] >> 8) as u8; // B
-        rgba_out[dst + 3] = (bgra64[src + 3] >> 8) as u8; // A
+        rgba_out[dst] = (load_u16::<BE>(bgra64[src + 2]) >> 8) as u8; // R
+        rgba_out[dst + 1] = (load_u16::<BE>(bgra64[src + 1]) >> 8) as u8; // G
+        rgba_out[dst + 2] = (load_u16::<BE>(bgra64[src]) >> 8) as u8; // B
+        rgba_out[dst + 3] = (load_u16::<BE>(bgra64[src + 3]) >> 8) as u8; // A
     }
 }
 
-/// Bgra64 → packed u16 RGBA: B↔R swap, pass through source alpha unchanged.
+/// Bgra64 → packed u16 RGBA: B↔R swap (+ optional byte-swap), pass through source alpha.
+///
+/// When `BE = true` each element is byte-swapped to produce host-native output.
 ///
 /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all native u16).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], width: usize) {
+pub(crate) fn bgra64_to_rgba_u16_row<const BE: bool>(
+    bgra64: &[u16],
+    rgba_u16_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short");
     debug_assert!(
         rgba_u16_out.len() >= width * 4,
@@ -277,10 +393,10 @@ pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], w
     for x in 0..width {
         let src = x * 4;
         let dst = x * 4;
-        rgba_u16_out[dst] = bgra64[src + 2]; // R
-        rgba_u16_out[dst + 1] = bgra64[src + 1]; // G
-        rgba_u16_out[dst + 2] = bgra64[src]; // B
-        rgba_u16_out[dst + 3] = bgra64[src + 3]; // A (unchanged)
+        rgba_u16_out[dst] = load_u16::<BE>(bgra64[src + 2]); // R
+        rgba_u16_out[dst + 1] = load_u16::<BE>(bgra64[src + 1]); // G
+        rgba_u16_out[dst + 2] = load_u16::<BE>(bgra64[src]); // B
+        rgba_u16_out[dst + 3] = load_u16::<BE>(bgra64[src + 3]); // A (byte-order corrected)
     }
 }
 
@@ -297,7 +413,7 @@ mod tests {
     fn rgb48_to_rgb_u16_all_white_passthrough() {
         let src = std::vec![0xFFFFu16; 3 * 4];
         let mut out = std::vec![0u16; 3 * 4];
-        rgb48_to_rgb_u16_row(&src, &mut out, 4);
+        rgb48_to_rgb_u16_row::<false>(&src, &mut out, 4);
         assert!(
             out.iter().all(|&v| v == 0xFFFF),
             "expected all 0xFFFF, got {out:?}"
@@ -309,7 +425,7 @@ mod tests {
     fn rgb48_to_rgb_all_white_narrow() {
         let src = std::vec![0xFFFFu16; 3 * 4];
         let mut out = std::vec![0u8; 3 * 4];
-        rgb48_to_rgb_row(&src, &mut out, 4);
+        rgb48_to_rgb_row::<false>(&src, &mut out, 4);
         assert!(
             out.iter().all(|&v| v == 0xFF),
             "expected all 0xFF, got {out:?}"
@@ -321,7 +437,7 @@ mod tests {
     fn rgb48_to_rgb_narrow_known_value() {
         let src = [0x1234u16, 0x5678, 0x9ABC];
         let mut out = [0u8; 3];
-        rgb48_to_rgb_row(&src, &mut out, 1);
+        rgb48_to_rgb_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x12, "R channel");
         assert_eq!(out[1], 0x56, "G channel");
         assert_eq!(out[2], 0x9A, "B channel");
@@ -332,7 +448,7 @@ mod tests {
     fn rgb48_to_rgba_forces_alpha_0xff() {
         let src = [0xAAAAu16, 0xBBBB, 0xCCCC];
         let mut out = [0u8; 4];
-        rgb48_to_rgba_row(&src, &mut out, 1);
+        rgb48_to_rgba_row::<false>(&src, &mut out, 1);
         assert_eq!(out[3], 0xFF, "alpha must be 0xFF");
         assert_eq!(out[0], 0xAA, "R");
         assert_eq!(out[1], 0xBB, "G");
@@ -344,7 +460,7 @@ mod tests {
     fn rgb48_to_rgba_u16_forces_alpha_0xffff() {
         let src = [0xAAAAu16, 0xBBBB, 0xCCCC];
         let mut out = [0u16; 4];
-        rgb48_to_rgba_u16_row(&src, &mut out, 1);
+        rgb48_to_rgba_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0xAAAA, "R");
         assert_eq!(out[1], 0xBBBB, "G");
         assert_eq!(out[2], 0xCCCC, "B");
@@ -358,7 +474,7 @@ mod tests {
     fn bgr48_to_rgb_u16_all_white_passthrough() {
         let src = std::vec![0xFFFFu16; 3 * 3];
         let mut out = std::vec![0u16; 3 * 3];
-        bgr48_to_rgb_u16_row(&src, &mut out, 3);
+        bgr48_to_rgb_u16_row::<false>(&src, &mut out, 3);
         assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF");
     }
 
@@ -367,7 +483,7 @@ mod tests {
     fn bgr48_to_rgb_all_white_narrow() {
         let src = std::vec![0xFFFFu16; 3 * 3];
         let mut out = std::vec![0u8; 3 * 3];
-        bgr48_to_rgb_row(&src, &mut out, 3);
+        bgr48_to_rgb_row::<false>(&src, &mut out, 3);
         assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF");
     }
 
@@ -378,7 +494,7 @@ mod tests {
         // Source pixel in BGR order: B=0x1234, G=0x5678, R=0x9ABC
         let src = [0x1234u16, 0x5678, 0x9ABC];
         let mut out = [0u16; 3];
-        bgr48_to_rgb_u16_row(&src, &mut out, 1);
+        bgr48_to_rgb_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x9ABC, "R (was at src[2])");
         assert_eq!(out[1], 0x5678, "G (unchanged)");
         assert_eq!(out[2], 0x1234, "B (was at src[0])");
@@ -389,7 +505,7 @@ mod tests {
     fn bgr48_to_rgb_channel_order_and_narrow() {
         let src = [0x1200u16, 0x5600, 0x9A00];
         let mut out = [0u8; 3];
-        bgr48_to_rgb_row(&src, &mut out, 1);
+        bgr48_to_rgb_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x9A, "R");
         assert_eq!(out[1], 0x56, "G");
         assert_eq!(out[2], 0x12, "B");
@@ -400,7 +516,7 @@ mod tests {
     fn bgr48_to_rgba_channel_order_and_alpha() {
         let src = [0x1100u16, 0x2200, 0x3300];
         let mut out = [0u8; 4];
-        bgr48_to_rgba_row(&src, &mut out, 1);
+        bgr48_to_rgba_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x33, "R");
         assert_eq!(out[1], 0x22, "G");
         assert_eq!(out[2], 0x11, "B");
@@ -412,7 +528,7 @@ mod tests {
     fn bgr48_to_rgba_u16_channel_order_and_alpha() {
         let src = [0x1111u16, 0x2222, 0x3333];
         let mut out = [0u16; 4];
-        bgr48_to_rgba_u16_row(&src, &mut out, 1);
+        bgr48_to_rgba_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x3333, "R");
         assert_eq!(out[1], 0x2222, "G");
         assert_eq!(out[2], 0x1111, "B");
@@ -426,7 +542,7 @@ mod tests {
     fn rgba64_to_rgba_u16_all_white_passthrough() {
         let src = std::vec![0xFFFFu16; 4 * 3];
         let mut out = std::vec![0u16; 4 * 3];
-        rgba64_to_rgba_u16_row(&src, &mut out, 3);
+        rgba64_to_rgba_u16_row::<false>(&src, &mut out, 3);
         assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF");
     }
 
@@ -435,7 +551,7 @@ mod tests {
     fn rgba64_to_rgba_all_white_narrow() {
         let src = std::vec![0xFFFFu16; 4 * 3];
         let mut out = std::vec![0u8; 4 * 3];
-        rgba64_to_rgba_row(&src, &mut out, 3);
+        rgba64_to_rgba_row::<false>(&src, &mut out, 3);
         assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF");
    }
 
@@ -445,7 +561,7 @@ mod tests {
         // R=0x1111, G=0x2222, B=0x3333, A=0xABCD
         let src = [0x1111u16, 0x2222, 0x3333, 0xABCD];
         let mut out = [0u16; 4];
-        rgba64_to_rgba_u16_row(&src, &mut out, 1);
+        rgba64_to_rgba_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x1111, "R");
         assert_eq!(out[1], 0x2222, "G");
         assert_eq!(out[2], 0x3333, "B");
@@ -457,7 +573,7 @@ mod tests {
     fn rgba64_to_rgba_source_alpha_depth_converted() {
         let src = [0x1100u16, 0x2200, 0x3300, 0xABCD];
         let mut out = [0u8; 4];
-        rgba64_to_rgba_row(&src, &mut out, 1);
+        rgba64_to_rgba_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x11, "R");
         assert_eq!(out[1], 0x22, "G");
         assert_eq!(out[2], 0x33, "B");
@@ -469,7 +585,7 @@ mod tests {
     fn rgba64_to_rgb_drops_alpha() {
         let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD];
         let mut out = [0u8; 3];
-        rgba64_to_rgb_row(&src, &mut out, 1);
+        rgba64_to_rgb_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x11, "R");
         assert_eq!(out[1], 0x22, "G");
         assert_eq!(out[2], 0x33, "B");
@@ -480,7 +596,7 @@ mod tests {
     fn rgba64_to_rgb_u16_drops_alpha() {
         let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD];
         let mut out = [0u16; 3];
-        rgba64_to_rgb_u16_row(&src, &mut out, 1);
+        rgba64_to_rgb_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x1111, "R");
         assert_eq!(out[1], 0x2222, "G");
         assert_eq!(out[2], 0x3333, "B");
@@ -493,7 +609,7 @@ mod tests {
     fn bgra64_to_rgba_u16_all_white_passthrough() {
         let src = std::vec![0xFFFFu16; 4 * 2];
         let mut out = std::vec![0u16; 4 * 2];
-        bgra64_to_rgba_u16_row(&src, &mut out, 2);
+        bgra64_to_rgba_u16_row::<false>(&src, &mut out, 2);
         assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF");
     }
 
@@ -502,7 +618,7 @@ mod tests {
     fn bgra64_to_rgba_all_white_narrow() {
         let src = std::vec![0xFFFFu16; 4 * 2];
         let mut out = std::vec![0u8; 4 * 2];
-        bgra64_to_rgba_row(&src, &mut out, 2);
+        bgra64_to_rgba_row::<false>(&src, &mut out, 2);
         assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF");
     }
 
@@ -512,7 +628,7 @@ mod tests {
         // Source in BGRA order: B=0x1111, G=0x2222, R=0x3333, A=0x4444
         let src = [0x1111u16, 0x2222, 0x3333, 0x4444];
         let mut out = [0u16; 4];
-        bgra64_to_rgba_u16_row(&src, &mut out, 1);
+        bgra64_to_rgba_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x3333, "R (from src[2])");
         assert_eq!(out[1], 0x2222, "G (unchanged)");
         assert_eq!(out[2], 0x1111, "B (from src[0])");
@@ -524,7 +640,7 @@ mod tests {
     fn bgra64_to_rgba_channel_order_and_alpha_narrowed() {
         let src = [0x1100u16, 0x2200, 0x3300, 0xAB00];
         let mut out = [0u8; 4];
-        bgra64_to_rgba_row(&src, &mut out, 1);
+        bgra64_to_rgba_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x33, "R");
         assert_eq!(out[1], 0x22, "G");
         assert_eq!(out[2], 0x11, "B");
@@ -536,7 +652,7 @@ mod tests {
     fn bgra64_to_rgb_drops_alpha_and_swaps() {
         let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD];
         let mut out = [0u8; 3];
-        bgra64_to_rgb_row(&src, &mut out, 1);
+        bgra64_to_rgb_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x33, "R");
         assert_eq!(out[1], 0x22, "G");
         assert_eq!(out[2], 0x11, "B");
@@ -547,7 +663,7 @@ mod tests {
     fn bgra64_to_rgb_u16_drops_alpha_and_swaps() {
         let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD];
         let mut out = [0u16; 3];
-        bgra64_to_rgb_u16_row(&src, &mut out, 1);
+        bgra64_to_rgb_u16_row::<false>(&src, &mut out, 1);
         assert_eq!(out[0], 0x3333, "R");
         assert_eq!(out[1], 0x2222, "G");
         assert_eq!(out[2], 0x1111, "B");
@@ -564,7 +680,7 @@ mod tests {
             0x1100u16, 0x2200, 0x3300, 0x4400, 0x5500, 0x6600, 0x7700, 0x8800, 0x9900,
         ];
         let mut out = [0u8; 9];
-        rgb48_to_rgb_row(&src, &mut out, 3);
+        rgb48_to_rgb_row::<false>(&src, &mut out, 3);
         assert_eq!(out[0], 0x11);
         assert_eq!(out[1], 0x22);
         assert_eq!(out[2], 0x33);
@@ -584,7 +700,7 @@ mod tests {
             0x5555, 0x6666, 0x7777, 0x8888, // pixel 1
         ];
         let mut out = [0u16; 8];
-        rgba64_to_rgba_u16_row(&src, &mut out, 2);
+        rgba64_to_rgba_u16_row::<false>(&src, &mut out, 2);
         assert_eq!(&out, &src, "identity copy must be byte-exact");
     }
 
@@ -598,8 +714,8 @@ mod tests {
         let mut rgb48_out = [0u8; 3];
         let mut bgr48_out = [0u8; 3];
 
-        rgb48_to_rgb_row(&rgb48_src, &mut rgb48_out, 1);
-        bgr48_to_rgb_row(&bgr48_src, &mut bgr48_out, 1);
+        rgb48_to_rgb_row::<false>(&rgb48_src, &mut rgb48_out, 1);
+        bgr48_to_rgb_row::<false>(&bgr48_src, &mut bgr48_out, 1);
 
         assert_eq!(
             rgb48_out, bgr48_out,
diff --git a/src/sinker/mixed/packed_rgb_10bit.rs b/src/sinker/mixed/packed_rgb_10bit.rs
index 4470663..2e171c2 100644
--- a/src/sinker/mixed/packed_rgb_10bit.rs
+++ b/src/sinker/mixed/packed_rgb_10bit.rs
@@ -32,8 +32,9 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        rgb_to_hsv_row, rgb_to_luma_row, x2bgr10_to_rgb_row, x2bgr10_to_rgb_u16_row,
-        x2bgr10_to_rgba_row, x2rgb10_to_rgb_row, x2rgb10_to_rgb_u16_row, x2rgb10_to_rgba_row,
+        rgb_to_hsv_row, rgb_to_luma_row, x2bgr10_to_rgb_row_endian, x2bgr10_to_rgb_u16_row_endian,
+        x2bgr10_to_rgba_row_endian, x2rgb10_to_rgb_row_endian, x2rgb10_to_rgb_u16_row_endian,
+        x2rgb10_to_rgba_row_endian,
     },
     yuv::{X2Bgr10, X2Bgr10Row, X2Bgr10Sink, X2Rgb10, X2Rgb10Row, X2Rgb10Sink},
 };
@@ -149,7 +150,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> {
                 w,
                 h,
             )?;
-            x2rgb10_to_rgb_row(x2rgb10_in, rgb_row, w, use_simd);
+            x2rgb10_to_rgb_row_endian::<false>(x2rgb10_in, rgb_row, w, use_simd);
 
             if let Some(luma) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -177,7 +178,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> {
         // u8 RGBA output (single-pass, dedicated kernel forces alpha).
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            x2rgb10_to_rgba_row(x2rgb10_in, rgba_row, w, use_simd);
+            x2rgb10_to_rgba_row_endian::<false>(x2rgb10_in, rgba_row, w, use_simd);
         }
 
         // u16 native RGB output (10-bit precision preserved).
@@ -193,7 +194,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            x2rgb10_to_rgb_u16_row(x2rgb10_in, rgb_u16_row, w, use_simd);
+            x2rgb10_to_rgb_u16_row_endian::<false>(x2rgb10_in, rgb_u16_row, w, use_simd);
         }
 
         Ok(())
@@ -307,7 +308,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> {
                 w,
                 h,
             )?;
-            x2bgr10_to_rgb_row(x2bgr10_in, rgb_row, w, use_simd);
+            x2bgr10_to_rgb_row_endian::<false>(x2bgr10_in, rgb_row, w, use_simd);
 
             if let Some(luma) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -334,7 +335,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> {
 
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?;
-            x2bgr10_to_rgba_row(x2bgr10_in, rgba_row, w, use_simd);
+            x2bgr10_to_rgba_row_endian::<false>(x2bgr10_in, rgba_row, w, use_simd);
         }
 
         if want_rgb_u16 {
@@ -349,7 +350,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            x2bgr10_to_rgb_u16_row(x2bgr10_in, rgb_u16_row, w, use_simd);
+            x2bgr10_to_rgb_u16_row_endian::<false>(x2bgr10_in, rgb_u16_row, w, use_simd);
         }
 
         Ok(())
diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs
index 49e34d8..1fa4a02 100644
--- a/src/sinker/mixed/packed_rgb_16bit.rs
+++ b/src/sinker/mixed/packed_rgb_16bit.rs
@@ -35,12 +35,13 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        bgr48_to_rgb_row, bgr48_to_rgb_u16_row, bgr48_to_rgba_row, bgr48_to_rgba_u16_row,
-        bgra64_to_rgb_row, bgra64_to_rgb_u16_row, bgra64_to_rgba_row, bgra64_to_rgba_u16_row,
-        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, rgb_to_luma_row,
-        rgb_to_luma_u16_row, rgb48_to_rgb_row, rgb48_to_rgb_u16_row, rgb48_to_rgba_row,
-        rgb48_to_rgba_u16_row, rgba64_to_rgb_row, rgba64_to_rgb_u16_row, rgba64_to_rgba_row,
-        rgba64_to_rgba_u16_row,
+        bgr48_to_rgb_row_endian, bgr48_to_rgb_u16_row_endian, bgr48_to_rgba_row_endian,
+        bgr48_to_rgba_u16_row_endian, bgra64_to_rgb_row_endian, bgra64_to_rgb_u16_row_endian,
+        bgra64_to_rgba_row_endian, bgra64_to_rgba_u16_row_endian, expand_rgb_to_rgba_row,
+        expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, rgb_to_luma_row, rgb_to_luma_u16_row,
+        rgb48_to_rgb_row_endian, rgb48_to_rgb_u16_row_endian, rgb48_to_rgba_row_endian,
+        rgb48_to_rgba_u16_row_endian, rgba64_to_rgb_row_endian, rgba64_to_rgb_u16_row_endian,
+        rgba64_to_rgba_row_endian, rgba64_to_rgba_u16_row_endian,
     },
     yuv::{
         Bgr48, Bgr48Row, Bgr48Sink, Bgra64, Bgra64Row, Bgra64Sink, Rgb48, Rgb48Row, Rgb48Sink, Rgba64,
@@ -206,7 +207,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> {
         // with_luma_u16, or with_hsv is attached.
         if need_u8_rgb {
             let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?;
-            rgb48_to_rgb_row(in48, rgb_row, w, use_simd);
+            rgb48_to_rgb_row_endian::<false>(in48, rgb_row, w, use_simd);
 
             if let Some(luma_buf) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -245,7 +246,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> {
         // u8 RGBA — single-pass kernel, alpha forced to 0xFF.
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?;
-            rgb48_to_rgba_row(in48, rgba_row, w, use_simd);
+            rgb48_to_rgba_row_endian::<false>(in48, rgba_row, w, use_simd);
         }
 
         // u16 RGB — native passthrough.
@@ -257,13 +258,13 @@ impl PixelSink for MixedSinker<'_, Rgb48> {
                 height: h,
                 channels: 3,
             })?;
-            rgb48_to_rgb_u16_row(in48, &mut buf[ps * 3..end], w, use_simd);
+            rgb48_to_rgb_u16_row_endian::<false>(in48, &mut buf[ps * 3..end], w, use_simd);
         }
 
         // u16 RGBA — native passthrough, alpha forced to 0xFFFF.
         if let Some(buf) = rgba_u16.as_deref_mut() {
             let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?;
-            rgb48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd);
+            rgb48_to_rgba_u16_row_endian::<false>(in48, rgba_u16_row, w, use_simd);
         }
 
         Ok(())
@@ -426,7 +427,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> {
 
         if need_u8_rgb {
             let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?;
-            bgr48_to_rgb_row(in48, rgb_row, w, use_simd);
+            bgr48_to_rgb_row_endian::<false>(in48, rgb_row, w, use_simd);
 
             if let Some(luma_buf) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -464,7 +465,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> {
 
         if let Some(buf) = rgba.as_deref_mut() {
             let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?;
-            bgr48_to_rgba_row(in48, rgba_row, w, use_simd);
+            bgr48_to_rgba_row_endian::<false>(in48, rgba_row, w, use_simd);
         }
 
         if let Some(buf) = rgb_u16.as_deref_mut() {
@@ -475,12 +476,12 @@ impl PixelSink for MixedSinker<'_, Bgr48> {
                 height: h,
                 channels: 3,
             })?;
-            bgr48_to_rgb_u16_row(in48, &mut buf[ps * 3..end], w, use_simd);
+            bgr48_to_rgb_u16_row_endian::<false>(in48, &mut buf[ps * 3..end], w, use_simd);
         }
 
         if let Some(buf) = rgba_u16.as_deref_mut() {
             let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?;
-            bgr48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd);
+            bgr48_to_rgba_u16_row_endian::<false>(in48, rgba_u16_row, w, use_simd);
         }
 
         Ok(())
@@ -667,7 +668,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
         if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?;
-            rgba64_to_rgba_row(in64, rgba_row, w, use_simd);
+            rgba64_to_rgba_row_endian::<false>(in64, rgba_row, w, use_simd);
             return Ok(());
         }
 
@@ -675,7 +676,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
         if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?;
-            rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd);
+            rgba64_to_rgba_u16_row_endian::<false>(in64, rgba_u16_row, w, use_simd);
             return Ok(());
         }
 
@@ -683,7 +684,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
         // and Strategy A+ RGBA fan-out.
         if need_u8_rgb {
             let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?;
-            rgba64_to_rgb_row(in64, rgb_row, w, use_simd);
+            rgba64_to_rgb_row_endian::<false>(in64, rgb_row, w, use_simd);
 
             if let Some(luma_buf) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -739,7 +740,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
         if want_rgba && !need_u8_rgb {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?;
-            rgba64_to_rgba_row(in64, rgba_row, w, use_simd);
+            rgba64_to_rgba_row_endian::<false>(in64, rgba_row, w, use_simd);
         }
 
         // ===== u16 path =====
@@ -754,7 +755,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end];
-            rgba64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd);
+            rgba64_to_rgb_u16_row_endian::<false>(in64, rgb_u16_row, w, use_simd);
 
             // Strategy A+ u16: RGBA u16 also attached — derive from the
             // just-computed u16 RGB row (writes α=0xFFFF), then overwrite α
@@ -778,7 +779,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> {
         if want_rgba_u16 && !want_rgb_u16 {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?;
-            rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd);
+            rgba64_to_rgba_u16_row_endian::<false>(in64, rgba_u16_row, w, use_simd);
         }
 
         Ok(())
@@ -950,7 +951,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> {
         if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?;
-            bgra64_to_rgba_row(in64, rgba_row, w, use_simd);
+            bgra64_to_rgba_row_endian::<false>(in64, rgba_row, w, use_simd);
             return Ok(());
         }
 
@@ -958,14 +959,14 @@ impl PixelSink for MixedSinker<'_, Bgra64> {
         if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?;
-            bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd);
+            bgra64_to_rgba_u16_row_endian::<false>(in64, rgba_u16_row, w, use_simd);
             return Ok(());
         }
 
         // u8 RGB staging path.
         if need_u8_rgb {
             let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?;
-            bgra64_to_rgb_row(in64, rgb_row, w, use_simd);
+            bgra64_to_rgb_row_endian::<false>(in64, rgb_row, w, use_simd);
 
             if let Some(luma_buf) = luma.as_deref_mut() {
                 rgb_to_luma_row(
@@ -1017,7 +1018,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> {
         if want_rgba && !need_u8_rgb {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?;
-            bgra64_to_rgba_row(in64, rgba_row, w, use_simd);
+            bgra64_to_rgba_row_endian::<false>(in64, rgba_row, w, use_simd);
         }
 
         // u16 RGB path.
@@ -1031,7 +1032,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> {
                 channels: 3,
             })?;
             let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end];
-            bgra64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd);
+            bgra64_to_rgb_u16_row_endian::<false>(in64, rgb_u16_row, w, use_simd);
 
             // Strategy A+ u16: RGBA u16 also attached.
             if want_rgba_u16 {
@@ -1052,7 +1053,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> {
         if want_rgba_u16 && !want_rgb_u16 {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?;
-            bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd);
+            bgra64_to_rgba_u16_row_endian::<false>(in64, rgba_u16_row, w, use_simd);
         }
 
         Ok(())
diff --git a/src/sinker/mixed/tests/packed_rgb_16bit.rs b/src/sinker/mixed/tests/packed_rgb_16bit.rs
index 17d506d..eb508f5 100644
--- a/src/sinker/mixed/tests/packed_rgb_16bit.rs
+++ b/src/sinker/mixed/tests/packed_rgb_16bit.rs
@@ -417,3 +417,64 @@ fn rgb48_multi_row_frame() {
     assert_eq!(out[10], 0xFF);
     assert_eq!(out[11], 0xFF);
 }
+
+// ---- BE-contract regression -----------------------------------------------
+
+/// Rgb48 sinker LE-encoded plane decodes correctly on every host.
+///
+/// The frame doc-comment contract (see `src/frame/packed_rgb_16bit.rs`) says
+/// the `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as
+/// `u16` (matching FFmpeg's `*LE` suffix). On a little-endian host LE bytes
+/// are host-native — identity. On a big-endian host the bytes are swapped
+/// relative to host-native, so the kernel must apply `u16::from_le` (kernel
+/// generic `BE = false`) to recover the host-native sample before arithmetic.
+///
+/// This test builds the plane from LE-encoded u16 patterns
+/// (`.to_le()` on each sample of `intended`) and asserts the sinker output
+/// matches the host-native `intended` values bit-exact via the
+/// `with_rgb_u16` (identity) path. On a BE host with a regressed pre-swap
+/// (caller swaps, kernel swaps again → double swap) this would corrupt every
+/// sample.
+///
+/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD
+/// intrinsics — which lets it execute under `cargo miri test`. BE CI is
+/// driven by miri on s390x / powerpc64; gating it out of miri would skip
+/// exactly the host where BE corruption would surface.
+///
+/// Mirrors the `rgbf32_sinker_le_encoded_frame_decodes_correctly` pattern
+/// added in PR #92's `5b42065` / `3b1d716`.
+#[test]
+fn rgb48_sinker_le_encoded_frame_decodes_correctly() {
+    // Mix high / mid / low / asymmetric byte patterns so any byte-swap
+    // regression shows up as a non-trivial mismatch (not just a no-op pattern).
+    let intended: Vec<u16> = (0..16 * 4 * 3)
+        .map(|i| match i % 4 {
+            0 => 0x1234,
+            1 => 0xABCD,
+            2 => 0x00FF,
+            _ => 0xFF00,
+        })
+        .collect();
+    // Construct the plane as LE-encoded u16 (the documented `*LE` Frame
+    // contract). On an LE host this is identity; on a BE host the bit-pattern
+    // is byte-swapped so the kernel must `from_le` it back to host-native.
+    let pix: Vec<u16> = intended.iter().map(|&v| v.to_le()).collect();
+    let src = Rgb48Frame::try_new(&pix, 16, 4, 16 * 3).unwrap();
+
+    // `with_rgb_u16` is the identity passthrough — the cleanest probe of the
+    // endian contract because no narrowing or arithmetic obscures the bit
+    // pattern. A single mismatched sample byte-swap would be unmissable.
+    let mut rgb_u16_out = vec![0u16; 16 * 4 * 3];
+    let mut sink = MixedSinker::<Rgb48>::new(16, 4)
+        .with_simd(false)
+        .with_rgb_u16(&mut rgb_u16_out)
+        .unwrap();
+    rgb48_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap();
+
+    // Output must be host-native intended values. On a BE host with a
+    // regressed pre-swap (caller swaps, kernel swaps again) this would be
+    // byte-swapped relative to `intended`.
+    assert_eq!(
+        rgb_u16_out, intended,
+        "Rgb48 sinker LE-encoded plane decoded incorrectly (BE-contract regression)"
+    );
+}
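
For reference, a minimal caller-side sketch (not part of the diff) of how the documented `*LE` plane contract meets the new const-generic scalar kernels; the wrapper name `decode_rgb48_le_row` is hypothetical, and only `rgb48_to_rgb_u16_row` comes from the diff above:

// Minimal sketch, assuming the `rgb48_to_rgb_u16_row<const BE: bool>` signature
// introduced above. An FFmpeg `*LE` plane is always decoded with `BE = false`:
// `u16::from_le` is a no-op on little-endian hosts and a byte-swap on
// big-endian hosts (s390x), so `out` holds host-native samples either way.
fn decode_rgb48_le_row(rgb48_le_plane: &[u16], out: &mut [u16], width: usize) {
    rgb48_to_rgb_u16_row::<false>(rgb48_le_plane, out, width);
}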