From 95b918a982e56bc04903114acfaf7c44c9131ead Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 00:43:10 +1200 Subject: [PATCH 1/6] feat(be-tier8): BE support for Rgb48/Bgr48/Rgba64/Bgra64/X2Rgb10/X2Bgr10 row kernels Add to all 6 packed-RGB-16bit and 10-bit format row kernels (dispatchers, scalars, all 5 arch backends, sinkers) so big-endian pixel sources can decode each format without a separate code path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/packed_rgb.rs | 462 ++++++------ src/row/arch/neon/packed_rgb_16bit.rs | 276 +++++-- src/row/arch/neon/tests/packed_rgb.rs | 24 +- src/row/arch/neon/tests/packed_rgb_16bit.rs | 80 +-- src/row/arch/wasm_simd128/packed_rgb.rs | 674 +++++++++--------- src/row/arch/wasm_simd128/packed_rgb_16bit.rs | 250 ++++--- src/row/arch/wasm_simd128/tests/packed_rgb.rs | 24 +- .../wasm_simd128/tests/packed_rgb_16bit.rs | 64 +- src/row/arch/x86_avx2/packed_rgb.rs | 132 ++-- src/row/arch/x86_avx2/packed_rgb_16bit.rs | 335 ++++++--- src/row/arch/x86_avx2/tests/packed_rgb.rs | 24 +- .../arch/x86_avx2/tests/packed_rgb_16bit.rs | 136 ++-- src/row/arch/x86_avx512/packed_rgb.rs | 156 ++-- src/row/arch/x86_avx512/packed_rgb_16bit.rs | 287 +++++--- src/row/arch/x86_avx512/tests/packed_rgb.rs | 24 +- .../arch/x86_avx512/tests/packed_rgb_16bit.rs | 136 ++-- src/row/arch/x86_sse41/packed_rgb.rs | 120 ++-- src/row/arch/x86_sse41/packed_rgb_16bit.rs | 256 ++++--- src/row/arch/x86_sse41/tests/packed_rgb.rs | 24 +- .../arch/x86_sse41/tests/packed_rgb_16bit.rs | 88 +-- src/row/dispatch/packed_rgb_16bit.rs | 428 ++++++----- src/row/dispatch/rgb_ops.rs | 114 +-- src/row/scalar/packed_rgb.rs | 70 +- src/row/scalar/packed_rgb_16bit.rs | 301 +++++--- src/sinker/mixed/packed_rgb_10bit.rs | 12 +- src/sinker/mixed/packed_rgb_16bit.rs | 40 +- 26 files changed, 2733 insertions(+), 1804 deletions(-) diff --git a/src/row/arch/neon/packed_rgb.rs b/src/row/arch/neon/packed_rgb.rs index 
ccf26eb2..17954389 100644 --- a/src/row/arch/neon/packed_rgb.rs +++ b/src/row/arch/neon/packed_rgb.rs @@ -517,55 +517,61 @@ unsafe fn x2_extract_10bit_u16_lane(pix: uint32x4_t, shift: i32) -> uint16x4_t { /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); - - // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit). - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgb = uint8x16x3_t(r, g, b); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = 
x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); + + // X2RGB10: R at >>22, G at >>12, B at >>2 (top 8 of 10-bit). + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgb = uint8x16x3_t(r, g, b); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -584,55 +590,61 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// 3. `x2rgb10` / `rgba_out` must not alias. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let alpha = vdupq_n_u8(0xFF); let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgba = uint8x16x4_t(r, g, b, alpha); - vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + 
x2_extract_10bit_u8_lane(p1, 22), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgba = uint8x16x4_t(r, g, b, alpha); + vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); + + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -651,37 +663,43 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); - - // Channel low bit positions: R at 20, G at 10, B at 0. 
- let r = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 20), - x2_extract_10bit_u16_lane(p1, 20), - ); - let g = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 10), - x2_extract_10bit_u16_lane(p1, 10), - ); - let b = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 0), - x2_extract_10bit_u16_lane(p1, 0), - ); - - let rgb = uint16x8x3_t(r, g, b); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 8; + if !BE { + while x + 8 <= width { + let p0 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2rgb10.as_ptr().add(x * 4 + 16)); + + // Channel low bit positions: R at 20, G at 10, B at 0. + let r = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 20), + x2_extract_10bit_u16_lane(p1, 20), + ); + let g = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 10), + x2_extract_10bit_u16_lane(p1, 10), + ); + let b = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 0), + x2_extract_10bit_u16_lane(p1, 0), + ); + + let rgb = uint16x8x3_t(r, g, b); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 8; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -695,54 +713,60 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// B at >>22). 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgb = uint8x16x3_t(r, g, b); - vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + x2_extract_10bit_u8_lane(p1, 2), + ); + let r_hi = vcombine_u16( + 
x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgb = uint8x16x3_t(r, g, b); + vst3q_u8(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -754,55 +778,61 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// NEON X2BGR10→RGBA. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let alpha = vdupq_n_u8(0xFF); let mut x = 0usize; - while x + 16 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); - let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); - - let r_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 2), - x2_extract_10bit_u8_lane(p1, 2), - ); - let r_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 2), - x2_extract_10bit_u8_lane(p3, 2), - ); - let g_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 12), - x2_extract_10bit_u8_lane(p1, 12), - ); - let g_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 12), - x2_extract_10bit_u8_lane(p3, 12), - ); - let b_lo = vcombine_u16( - x2_extract_10bit_u8_lane(p0, 22), - x2_extract_10bit_u8_lane(p1, 22), - ); - let b_hi = vcombine_u16( - x2_extract_10bit_u8_lane(p2, 22), - x2_extract_10bit_u8_lane(p3, 22), - ); - - let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); - let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); - let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); - - let rgba = uint8x16x4_t(r, g, b, alpha); - vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); - - x += 16; + if !BE { + while x + 16 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + let p2 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 32)); + let p3 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 48)); + + let r_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 2), + 
x2_extract_10bit_u8_lane(p1, 2), + ); + let r_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 2), + x2_extract_10bit_u8_lane(p3, 2), + ); + let g_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 12), + x2_extract_10bit_u8_lane(p1, 12), + ); + let g_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 12), + x2_extract_10bit_u8_lane(p3, 12), + ); + let b_lo = vcombine_u16( + x2_extract_10bit_u8_lane(p0, 22), + x2_extract_10bit_u8_lane(p1, 22), + ); + let b_hi = vcombine_u16( + x2_extract_10bit_u8_lane(p2, 22), + x2_extract_10bit_u8_lane(p3, 22), + ); + + let r = vcombine_u8(vqmovn_u16(r_lo), vqmovn_u16(r_hi)); + let g = vcombine_u8(vqmovn_u16(g_lo), vqmovn_u16(g_hi)); + let b = vcombine_u8(vqmovn_u16(b_lo), vqmovn_u16(b_hi)); + + let rgba = uint8x16x4_t(r, g, b, alpha); + vst4q_u8(rgba_out.as_mut_ptr().add(x * 4), rgba); + + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -814,37 +844,43 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// NEON X2BGR10→u16 RGB native. 8 pixels per iteration. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); - let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); - - // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29. 
- let r = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 0), - x2_extract_10bit_u16_lane(p1, 0), - ); - let g = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 10), - x2_extract_10bit_u16_lane(p1, 10), - ); - let b = vcombine_u16( - x2_extract_10bit_u16_lane(p0, 20), - x2_extract_10bit_u16_lane(p1, 20), - ); - - let rgb = uint16x8x3_t(r, g, b); - vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); - - x += 8; + if !BE { + while x + 8 <= width { + let p0 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4)); + let p1 = x2_load_le_u32x4(x2bgr10.as_ptr().add(x * 4 + 16)); + + // X2BGR10: R at low 10 bits, G at 10..19, B at 20..29. + let r = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 0), + x2_extract_10bit_u16_lane(p1, 0), + ); + let g = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 10), + x2_extract_10bit_u16_lane(p1, 10), + ); + let b = vcombine_u16( + x2_extract_10bit_u16_lane(p0, 20), + x2_extract_10bit_u16_lane(p1, 20), + ); + + let rgb = uint16x8x3_t(r, g, b); + vst3q_u16(rgb_out.as_mut_ptr().add(x * 3), rgb); + + x += 8; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/neon/packed_rgb_16bit.rs b/src/row/arch/neon/packed_rgb_16bit.rs index 3370c8a9..b2a35d01 100644 --- a/src/row/arch/neon/packed_rgb_16bit.rs +++ b/src/row/arch/neon/packed_rgb_16bit.rs @@ -21,6 +21,14 @@ //! - **Rgba64 / Bgra64:** `vld4q_u16(src_ptr)` → `uint16x8x4_t(ch0, ch1, ch2, ch3)`. //! For Bgra64, `ch0` = B and `ch2` = R (swapped on store). //! +//! ## Big-endian support +//! +//! Every public kernel accepts ``. When `BE = true`, each +//! per-channel `uint16x8_t` vector produced by `vld3q_u16`/`vld4q_u16` is +//! byte-swapped via `byteswap_u16x8::` before any channel math. On LE +//! targets (all current AArch64 hardware) the helper is a no-op and emits +//! zero extra instructions. +//! //! ## Depth conversion //! //! 
- **u16 → u8:** `vshrn_n_u16::<8>(v)` — high-byte extraction, matching @@ -35,6 +43,25 @@ use core::arch::aarch64::*; use crate::row::scalar; +// ---- endian byte-swap helper ------------------------------------------------ + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Implemented as `vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v)))`, +/// the same transform used inside `load_be_u16x8` in the NEON endian module. +/// +/// # Safety +/// +/// Caller must have NEON enabled. +#[inline(always)] +unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { + if BE { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -44,6 +71,8 @@ use crate::row::scalar; /// `vld3q_u16` deinterleaves into `(R, G, B)` u16x8; `vshrn_n_u16::<8>` /// narrows each channel; `vst3_u8` interleaves back. /// +/// When `BE = true` each channel vector is byte-swapped before narrowing. +/// /// # Safety /// /// 1. NEON must be available. @@ -51,7 +80,11 @@ use crate::row::scalar; /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -59,20 +92,22 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each channel vector is byte-swapped before narrowing. +/// /// # Safety /// /// 1. NEON must be available. @@ -80,7 +115,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -89,9 +128,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, alpha), @@ -99,14 +138,15 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } -/// NEON Rgb48 → native-depth u16 RGB (identity copy). 8 pixels per SIMD iteration. +/// NEON Rgb48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// -/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves — no narrowing. +/// `vld3q_u16` deinterleaves, `vst3q_u16` reinterleaves. +/// When `BE = true` each channel is byte-swapped to host-native order before storing. /// /// # Safety /// @@ -115,7 +155,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -125,18 +169,24 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.0, px.1, px.2), + uint16x8x3_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + ), ); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Rgb48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each channel is byte-swapped to host-native order before storing. +/// /// # Safety /// /// 1. NEON must be available. @@ -144,7 +194,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -155,12 +209,17 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.0, px.1, px.2, alpha), + uint16x8x4_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + alpha, + ), ); x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -173,6 +232,7 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// /// `vld3q_u16` deinterleaves into `(B, G, R)` u16x8; channels are swapped /// (`px.2` = R, `px.0` = B) in the `vst3_u8` call to produce R-first output. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -181,7 +241,11 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -190,20 +254,21 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R (source BGR order) let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.2); // R (was at position 2) - let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged) - let b8 = vshrn_n_u16::<8>(px.0); // B (was at position 0) + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (was at position 2) + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (was at position 0) vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap on output; alpha forced to 0xFF. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -212,7 +277,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -221,9 +290,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(px.2); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.0); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, alpha), @@ -231,13 +300,14 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// NEON Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap: `px.2` → position 0 (R), `px.0` → position 2 (B). +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -246,7 +316,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -257,18 +331,23 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 // Swap B↔R: store (R=px.2, G=px.1, B=px.0) vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.2, px.1, px.0), + uint16x8x3_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + ), ); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -277,7 +356,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -289,12 +372,17 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u // Store (R=px.2, G=px.1, B=px.0, A=0xFFFF) vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.2, px.1, px.0, alpha), + uint16x8x4_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + alpha, + ), ); x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -307,6 +395,7 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// /// `vld4q_u16` deinterleaves into `(R, G, B, A)` u16x8; R/G/B narrowed; /// `vst3_u8` writes only 3 channels. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -315,7 +404,11 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -323,15 +416,15 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // Alpha (px.3) discarded. vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -339,6 +432,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// NEON Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through. /// /// All 4 channels narrowed via `vshrn_n_u16::<8>`. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -347,7 +441,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -355,10 +453,10 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.0); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.2); - let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -366,7 +464,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -374,6 +472,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// NEON Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// /// `vld4q_u16` deinterleaves; `vst3q_u16` writes R, G, B channels only. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -382,7 +481,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -393,19 +496,24 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u // Alpha (px.3) discarded. vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.0, px.1, px.2), + uint16x8x3_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + ), ); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } -/// NEON Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per SIMD iteration. +/// NEON Rgba64 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// /// `vld4q_u16` deinterleaves; `vst4q_u16` reinterleaves — source alpha preserved. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -414,7 +522,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -428,12 +536,17 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.0, px.1, px.2, px.3), + uint16x8x4_t( + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.3), + ), ); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -446,6 +559,7 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `vld4q_u16` gives `(B, G, R, A)` → store `(R=px.2, G=px.1, B=px.0)`. +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -454,7 +568,11 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -463,21 +581,22 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R, px.3 = A let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.2); // R (from position 2) - let g8 = vshrn_n_u16::<8>(px.1); // G (unchanged) - let b8 = vshrn_n_u16::<8>(px.0); // B (from position 0) + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (from position 2) + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (from position 0) // Alpha (px.3) discarded. vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// NEON Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each channel is byte-swapped before narrowing. /// /// # Safety /// @@ -486,7 +605,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -494,10 +617,10 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(px.2); - let g8 = vshrn_n_u16::<8>(px.1); - let b8 = vshrn_n_u16::<8>(px.0); - let a8 = vshrn_n_u16::<8>(px.3); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); + let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -505,13 +628,14 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// NEON Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; alpha discarded. `vld4q_u16` → `vst3q_u16(R, G, B)`. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -520,7 +644,11 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -531,12 +659,16 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u // Swap B↔R, drop alpha: store (R=px.2, G=px.1, B=px.0) vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), - uint16x8x3_t(px.2, px.1, px.0), + uint16x8x3_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + ), ); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -545,6 +677,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// B↔R swap; source alpha preserved at position 3. /// /// `vld4q_u16` gives `(B, G, R, A)` → `vst4q_u16(R=px.2, G=px.1, B=px.0, A=px.3)`. +/// When `BE = true` each channel is byte-swapped to host-native order. /// /// # Safety /// @@ -553,7 +686,7 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -568,12 +701,17 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( // Swap B↔R, preserve A: store (R=px.2, G=px.1, B=px.0, A=px.3) vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), - uint16x8x4_t(px.2, px.1, px.0, px.3), + uint16x8x4_t( + byteswap_u16x8::(px.2), + byteswap_u16x8::(px.1), + byteswap_u16x8::(px.0), + byteswap_u16x8::(px.3), + ), ); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/neon/tests/packed_rgb.rs b/src/row/arch/neon/tests/packed_rgb.rs index 7e5ace29..8f597259 100644 --- a/src/row/arch/neon/tests/packed_rgb.rs +++ b/src/row/arch/neon/tests/packed_rgb.rs @@ -261,9 +261,9 @@ fn x2rgb10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -276,9 +276,9 @@ fn x2rgb10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_neon, w); + x2rgb10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -291,9 +291,9 @@ fn x2rgb10_to_rgb_u16_neon_matches_scalar_widths() { let input = 
pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_neon, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -306,9 +306,9 @@ fn x2bgr10_to_rgb_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_neon = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -321,9 +321,9 @@ fn x2bgr10_to_rgba_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_neon = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_neon, w); + x2bgr10_to_rgba_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } @@ -336,9 +336,9 @@ fn x2bgr10_to_rgb_u16_neon_matches_scalar_widths() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_neon = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_neon, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_neon, w); } assert_eq!(out_scalar, out_neon, "width {w}"); } diff --git a/src/row/arch/neon/tests/packed_rgb_16bit.rs b/src/row/arch/neon/tests/packed_rgb_16bit.rs index ad38e7b1..c71131d7 100644 --- 
a/src/row/arch/neon/tests/packed_rgb_16bit.rs +++ b/src/row/arch/neon/tests/packed_rgb_16bit.rs @@ -33,8 +33,8 @@ fn neon_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgb: SIMD vs scalar mismatch"); } @@ -45,8 +45,8 @@ fn neon_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgb48→rgba: SIMD vs scalar mismatch"); } @@ -57,8 +57,8 @@ fn neon_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16: SIMD vs scalar mismatch" @@ -72,8 +72,8 @@ fn neon_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { 
neon_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16: SIMD vs scalar mismatch" @@ -91,8 +91,8 @@ fn neon_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgb: SIMD vs scalar mismatch"); } @@ -103,8 +103,8 @@ fn neon_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgr48→rgba: SIMD vs scalar mismatch"); } @@ -115,8 +115,8 @@ fn neon_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16: SIMD vs scalar mismatch" @@ -130,8 +130,8 @@ fn neon_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) 
}; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16: SIMD vs scalar mismatch" @@ -149,8 +149,8 @@ fn neon_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgb: SIMD vs scalar mismatch"); } @@ -161,8 +161,8 @@ fn neon_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "rgba64→rgba: SIMD vs scalar mismatch"); } @@ -173,8 +173,8 @@ fn neon_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16: SIMD vs scalar mismatch" @@ -188,8 +188,8 @@ fn neon_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut 
scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16: SIMD vs scalar mismatch" @@ -207,8 +207,8 @@ fn neon_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { neon_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgb: SIMD vs scalar mismatch"); } @@ -219,8 +219,8 @@ fn neon_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { neon_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!(simd_out, scalar_out, "bgra64→rgba: SIMD vs scalar mismatch"); } @@ -231,8 +231,8 @@ fn neon_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { neon_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16: SIMD vs scalar mismatch" @@ -246,8 +246,8 @@ fn 
neon_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { neon_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16: SIMD vs scalar mismatch" @@ -265,8 +265,8 @@ fn neon_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -280,8 +280,8 @@ fn neon_rgba64_to_rgba_exact8_matches_scalar() { let src = make_rgba64_src(8, 0x0F0F); let mut simd_out = std::vec![0u8; 8 * 4]; let mut scalar_out = std::vec![0u8; 8 * 4]; - unsafe { neon_rgba64_to_rgba_row(&src, &mut simd_out, 8) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 8); + unsafe { neon_rgba64_to_rgba_row::(&src, &mut simd_out, 8) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgba64→rgba exact-8: SIMD vs scalar mismatch" @@ -299,8 +299,8 @@ fn neon_rgb48_to_rgb_width1_scalar_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { neon_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { neon_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only 
mismatch" @@ -314,8 +314,8 @@ fn neon_bgra64_to_rgba_u16_width1_scalar_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { neon_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { neon_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" diff --git a/src/row/arch/wasm_simd128/packed_rgb.rs b/src/row/arch/wasm_simd128/packed_rgb.rs index 49d1edb9..53644ab0 100644 --- a/src/row/arch/wasm_simd128/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/packed_rgb.rs @@ -623,99 +623,105 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// 3. `x2rgb10` / `rgb_out` must not alias. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); - - // Extract 10-bit channels as u32x4 (low 10 bits set per lane). - // X2RGB10: R at >>20, G at >>10, B at >>0. 
- let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); - let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); - let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - let b2 = v128_and(p2, mask_3ff); - let b3 = v128_and(p3, mask_3ff); - - // Down-shift 10-bit → 8-bit. - let r0_u8 = u32x4_shr(r0, 2); - let r1_u8 = u32x4_shr(r1, 2); - let r2_u8 = u32x4_shr(r2, 2); - let r3_u8 = u32x4_shr(r3, 2); - let g0_u8 = u32x4_shr(g0, 2); - let g1_u8 = u32x4_shr(g1, 2); - let g2_u8 = u32x4_shr(g2, 2); - let g3_u8 = u32x4_shr(g3, 2); - let b0_u8 = u32x4_shr(b0, 2); - let b1_u8 = u32x4_shr(b1, 2); - let b2_u8 = u32x4_shr(b2, 2); - let b3_u8 = u32x4_shr(b3, 2); - - // u32x4 → u16x8 (saturating narrow). - let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); - let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); - let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); - let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); - let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); - let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); - - // u16x8 → u8x16. - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - // Interleave (R, G, B) into 48 packed bytes via the same - // 9-shuffle pattern used by the YUV→RGB kernels. 
- let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2rgb10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2rgb10.as_ptr().add(x * 4 + 48).cast()); + + // Extract 10-bit channels as u32x4 (low 10 bits set per lane). + // X2RGB10: R at >>20, G at >>10, B at >>0. 
+ let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let r2 = v128_and(u32x4_shr(p2, 20), mask_3ff); + let r3 = v128_and(u32x4_shr(p3, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let g2 = v128_and(u32x4_shr(p2, 10), mask_3ff); + let g3 = v128_and(u32x4_shr(p3, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + let b2 = v128_and(p2, mask_3ff); + let b3 = v128_and(p3, mask_3ff); + + // Down-shift 10-bit → 8-bit. + let r0_u8 = u32x4_shr(r0, 2); + let r1_u8 = u32x4_shr(r1, 2); + let r2_u8 = u32x4_shr(r2, 2); + let r3_u8 = u32x4_shr(r3, 2); + let g0_u8 = u32x4_shr(g0, 2); + let g1_u8 = u32x4_shr(g1, 2); + let g2_u8 = u32x4_shr(g2, 2); + let g3_u8 = u32x4_shr(g3, 2); + let b0_u8 = u32x4_shr(b0, 2); + let b1_u8 = u32x4_shr(b1, 2); + let b2_u8 = u32x4_shr(b2, 2); + let b3_u8 = u32x4_shr(b3, 2); + + // u32x4 → u16x8 (saturating narrow). + let r_lo = u16x8_narrow_i32x4(r0_u8, r1_u8); + let r_hi = u16x8_narrow_i32x4(r2_u8, r3_u8); + let g_lo = u16x8_narrow_i32x4(g0_u8, g1_u8); + let g_hi = u16x8_narrow_i32x4(g2_u8, g3_u8); + let b_lo = u16x8_narrow_i32x4(b0_u8, b1_u8); + let b_hi = u16x8_narrow_i32x4(b2_u8, b3_u8); + + // u16x8 → u8x16. + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + // Interleave (R, G, B) into 48 packed bytes via the same + // 9-shuffle pattern used by the YUV→RGB kernels. 
+ let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -728,7 +734,11 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// to `0xFF`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -736,36 +746,38 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - - // Extract 10-bit channels into u32 lanes, down-shift to u8. - let r = v128_and(u32x4_shr(pix, 20), mask_3ff); - let g = v128_and(u32x4_shr(pix, 10), mask_3ff); - let b = v128_and(pix, mask_3ff); - let r = u32x4_shr(r, 2); - let g = u32x4_shr(g, 2); - let b = u32x4_shr(b, 2); - - // Pack (R, G, B, 0xFF) bytes per pixel. - // Each channel value is in low byte of its u32 lane. - // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + + // Extract 10-bit channels into u32 lanes, down-shift to u8. 
+ let r = v128_and(u32x4_shr(pix, 20), mask_3ff); + let g = v128_and(u32x4_shr(pix, 10), mask_3ff); + let b = v128_and(pix, mask_3ff); + let r = u32x4_shr(r, 2); + let g = u32x4_shr(g, 2); + let b = u32x4_shr(b, 2); + + // Pack (R, G, B, 0xFF) bytes per pixel. + // Each channel value is in low byte of its u32 lane. + // Shuffle to byte positions: R→[0,4,8,12], G→[1,5,9,13], B→[2,6,10,14], A→[3,7,11,15]. + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -777,72 +789,78 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2RGB10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(p0, mask_3ff); - let b1 = v128_and(p1, mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - // Interleave (R, G, B) u16x8 into 24 u16 elements. - // Element granularity is u16 (2 bytes); shuffle masks below - // index by byte. For u16-per-element interleave, byte mask - // pulls 2 consecutive bytes per element. - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`. 
- let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). - let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2rgb10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2rgb10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let r1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(p0, mask_3ff); + let b1 = v128_and(p1, mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + // Interleave (R, G, B) u16x8 into 24 u16 elements. + // Element granularity is u16 (2 bytes); shuffle masks below + // index by byte. For u16-per-element interleave, byte mask + // pulls 2 consecutive bytes per element. 
+ let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. + let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). + let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -855,80 +873,86 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// extracts R from low bits and B from high bits. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 16 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); - let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); - - // X2BGR10: R at low 10, G at >>10, B at >>20. - let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); - let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); - let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); - let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); - let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); - let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); - let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); - let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); - let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); - let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); - let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); - let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); - - let r_lo = u16x8_narrow_i32x4(r0, r1); - let r_hi = u16x8_narrow_i32x4(r2, r3); - let g_lo = u16x8_narrow_i32x4(g0, g1); - let g_hi = u16x8_narrow_i32x4(g2, g3); - let b_lo = u16x8_narrow_i32x4(b0, b1); - let b_hi = u16x8_narrow_i32x4(b2, b3); - - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); - let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, 
-1, 3, -1, -1, 4, -1, -1); - let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), - u8x16_swizzle(b_u8, b_mask0), - ); - let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); - let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); - let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), - u8x16_swizzle(b_u8, b_mask1), - ); - let r_mask2 = i8x16( - -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, - ); - let g_mask2 = i8x16( - -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, - ); - let b_mask2 = i8x16( - 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), - u8x16_swizzle(b_u8, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); - - x += 16; - } + if !BE { + while x + 16 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + let p2 = v128_load(x2bgr10.as_ptr().add(x * 4 + 32).cast()); + let p3 = v128_load(x2bgr10.as_ptr().add(x * 4 + 48).cast()); + + // X2BGR10: R at low 10, G at >>10, B at >>20. 
+ let r0 = u32x4_shr(v128_and(p0, mask_3ff), 2); + let r1 = u32x4_shr(v128_and(p1, mask_3ff), 2); + let r2 = u32x4_shr(v128_and(p2, mask_3ff), 2); + let r3 = u32x4_shr(v128_and(p3, mask_3ff), 2); + let g0 = u32x4_shr(v128_and(u32x4_shr(p0, 10), mask_3ff), 2); + let g1 = u32x4_shr(v128_and(u32x4_shr(p1, 10), mask_3ff), 2); + let g2 = u32x4_shr(v128_and(u32x4_shr(p2, 10), mask_3ff), 2); + let g3 = u32x4_shr(v128_and(u32x4_shr(p3, 10), mask_3ff), 2); + let b0 = u32x4_shr(v128_and(u32x4_shr(p0, 20), mask_3ff), 2); + let b1 = u32x4_shr(v128_and(u32x4_shr(p1, 20), mask_3ff), 2); + let b2 = u32x4_shr(v128_and(u32x4_shr(p2, 20), mask_3ff), 2); + let b3 = u32x4_shr(v128_and(u32x4_shr(p3, 20), mask_3ff), 2); + + let r_lo = u16x8_narrow_i32x4(r0, r1); + let r_hi = u16x8_narrow_i32x4(r2, r3); + let g_lo = u16x8_narrow_i32x4(g0, g1); + let g_hi = u16x8_narrow_i32x4(g2, g3); + let b_lo = u16x8_narrow_i32x4(b0, b1); + let b_hi = u16x8_narrow_i32x4(b2, b3); + + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + let r_mask0 = i8x16(0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1, 5); + let g_mask0 = i8x16(-1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1, -1); + let b_mask0 = i8x16(-1, -1, 0, -1, -1, 1, -1, -1, 2, -1, -1, 3, -1, -1, 4, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask0), u8x16_swizzle(g_u8, g_mask0)), + u8x16_swizzle(b_u8, b_mask0), + ); + let r_mask1 = i8x16(-1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10, -1); + let g_mask1 = i8x16(5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1, 10); + let b_mask1 = i8x16(-1, 5, -1, -1, 6, -1, -1, 7, -1, -1, 8, -1, -1, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask1), u8x16_swizzle(g_u8, g_mask1)), + u8x16_swizzle(b_u8, b_mask1), + ); + let r_mask2 = i8x16( + -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, -1, -1, + ); + let g_mask2 = i8x16( + -1, -1, 11, -1, -1, 12, -1, -1, 
13, -1, -1, 14, -1, -1, 15, -1, + ); + let b_mask2 = i8x16( + 10, -1, -1, 11, -1, -1, 12, -1, -1, 13, -1, -1, 14, -1, -1, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r_u8, r_mask2), u8x16_swizzle(g_u8, g_mask2)), + u8x16_swizzle(b_u8, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 32).cast(), out2); + + x += 16; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -941,7 +965,11 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// holds 4 RGBA pixels). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -949,30 +977,32 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi let mask_3ff = u32x4_splat(0x3FF); let alpha_const = u32x4_splat(0xFF00_0000); let mut x = 0usize; - while x + 4 <= width { - let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - - // X2BGR10 channel positions: R at low, G mid, B high. 
- let r = u32x4_shr(v128_and(pix, mask_3ff), 2); - let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); - let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); - - let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); - let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); - let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); - let out = v128_or( - v128_or( - v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), - u8x16_swizzle(b, b_mask), - ), - alpha_const, - ); - - v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); - x += 4; + if !BE { + while x + 4 <= width { + let pix = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + + // X2BGR10 channel positions: R at low, G mid, B high. + let r = u32x4_shr(v128_and(pix, mask_3ff), 2); + let g = u32x4_shr(v128_and(u32x4_shr(pix, 10), mask_3ff), 2); + let b = u32x4_shr(v128_and(u32x4_shr(pix, 20), mask_3ff), 2); + + let r_mask = i8x16(0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1); + let g_mask = i8x16(-1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1); + let b_mask = i8x16(-1, -1, 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1); + let out = v128_or( + v128_or( + v128_or(u8x16_swizzle(r, r_mask), u8x16_swizzle(g, g_mask)), + u8x16_swizzle(b, b_mask), + ), + alpha_const, + ); + + v128_store(rgba_out.as_mut_ptr().add(x * 4).cast(), out); + x += 4; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -984,68 +1014,74 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// WASM simd128 X2BGR10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mask_3ff = u32x4_splat(0x3FF); let mut x = 0usize; - while x + 8 <= width { - let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); - let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); - - let r0 = v128_and(p0, mask_3ff); - let r1 = v128_and(p1, mask_3ff); - let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); - let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); - let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); - let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); - - let r = u16x8_narrow_i32x4(r0, r1); - let g = u16x8_narrow_i32x4(g0, g1); - let b = u16x8_narrow_i32x4(b0, b1); - - let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); - let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); - let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); - let out0 = v128_or( - v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), - u8x16_swizzle(b, b_mask0), - ); - // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). - // Each u16 takes 2 bytes; the channel vectors hold element `i` at - // byte indices `(2*i, 2*i+1)`. - let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); - let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); - let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); - let out1 = v128_or( - v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), - u8x16_swizzle(b, b_mask1), - ); - // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). 
- let r_mask2 = i8x16( - -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, - ); - let g_mask2 = i8x16( - 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, - ); - let b_mask2 = i8x16( - -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, - ); - let out2 = v128_or( - v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), - u8x16_swizzle(b, b_mask2), - ); - - v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); - v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); - - x += 8; - } + if !BE { + while x + 8 <= width { + let p0 = v128_load(x2bgr10.as_ptr().add(x * 4).cast()); + let p1 = v128_load(x2bgr10.as_ptr().add(x * 4 + 16).cast()); + + let r0 = v128_and(p0, mask_3ff); + let r1 = v128_and(p1, mask_3ff); + let g0 = v128_and(u32x4_shr(p0, 10), mask_3ff); + let g1 = v128_and(u32x4_shr(p1, 10), mask_3ff); + let b0 = v128_and(u32x4_shr(p0, 20), mask_3ff); + let b1 = v128_and(u32x4_shr(p1, 20), mask_3ff); + + let r = u16x8_narrow_i32x4(r0, r1); + let g = u16x8_narrow_i32x4(g0, g1); + let b = u16x8_narrow_i32x4(b0, b1); + + let r_mask0 = i8x16(0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5, -1, -1); + let g_mask0 = i8x16(-1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1, 4, 5); + let b_mask0 = i8x16(-1, -1, -1, -1, 0, 1, -1, -1, -1, -1, 2, 3, -1, -1, -1, -1); + let out0 = v128_or( + v128_or(u8x16_swizzle(r, r_mask0), u8x16_swizzle(g, g_mask0)), + u8x16_swizzle(b, b_mask0), + ); + // Block 1 (output u16s 8..15 = [B2, R3, G3, B3, R4, G4, B4, R5]). + // Each u16 takes 2 bytes; the channel vectors hold element `i` at + // byte indices `(2*i, 2*i+1)`. 
+ let r_mask1 = i8x16(-1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1, 10, 11); + let g_mask1 = i8x16(-1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1, -1, -1); + let b_mask1 = i8x16(4, 5, -1, -1, -1, -1, 6, 7, -1, -1, -1, -1, 8, 9, -1, -1); + let out1 = v128_or( + v128_or(u8x16_swizzle(r, r_mask1), u8x16_swizzle(g, g_mask1)), + u8x16_swizzle(b, b_mask1), + ); + // Block 2 (output u16s 16..23 = [G5, B5, R6, G6, B6, R7, G7, B7]). + let r_mask2 = i8x16( + -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, -1, -1, + ); + let g_mask2 = i8x16( + 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, -1, -1, + ); + let b_mask2 = i8x16( + -1, -1, 10, 11, -1, -1, -1, -1, 12, 13, -1, -1, -1, -1, 14, 15, + ); + let out2 = v128_or( + v128_or(u8x16_swizzle(r, r_mask2), u8x16_swizzle(g, g_mask2)), + u8x16_swizzle(b, b_mask2), + ); + + v128_store(rgb_out.as_mut_ptr().add(x * 3).cast(), out0); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 8).cast(), out1); + v128_store(rgb_out.as_mut_ptr().add(x * 3 + 16).cast(), out2); + + x += 8; + } + } // end if !BE if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs index 087eb8f3..4fa3fed5 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs @@ -217,6 +217,24 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { u8x16_narrow_i16x8(shr, zero) } +// ---- endian byte-swap helper ------------------------------------------------- + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Uses `u8x16_swizzle` with a compile-time mask. 
+#[inline(always)] +unsafe fn byteswap_if_be(v: v128) -> v128 { + if BE { + // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] + u8x16_swizzle( + v, + i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14), + ) + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -234,7 +252,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -242,9 +264,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -257,7 +279,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -271,7 +293,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -280,9 +306,9 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -294,7 +320,7 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -308,7 +334,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -316,15 +346,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -338,7 +368,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -347,15 +381,15 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -374,7 +408,11 @@ pub(crate) unsafe fn wasm_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -382,9 +420,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); // ch0=B, ch1=G, ch2=R let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -396,7 +434,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -411,7 +449,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -420,9 +462,9 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -433,7 +475,7 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -448,7 +490,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -456,16 +502,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B order write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } @@ -480,7 +526,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -489,16 +539,16 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = v128_load(ptr.cast()); - let v1 = v128_load(ptr.add(8).cast()); - let v2 = v128_load(ptr.add(16).cast()); + let v0 = byteswap_if_be::(v128_load(ptr.cast())); + let v1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let v2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); // Output R, G, B, A order write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -517,7 +567,11 @@ pub(crate) unsafe fn wasm_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -525,10 +579,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -539,7 +593,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -554,7 +608,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -562,10 +620,10 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -577,7 +635,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -592,7 +650,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -600,16 +662,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -624,7 +686,7 @@ pub(crate) unsafe fn wasm_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -636,16 +698,16 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (r, g, b, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,7 +728,11 @@ pub(crate) unsafe fn wasm_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -674,10 +740,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // ch0=B, ch1=G, ch2=R, ch3=A let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); @@ -689,7 +755,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -704,7 +770,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -712,10 +782,10 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x8_to_u8x8(r); let g_u8 = narrow_u16x8_to_u8x8(g); @@ -727,7 +797,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -742,7 +812,11 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -750,17 +824,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0) let (b, g, r, _a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -775,7 +849,7 @@ pub(crate) unsafe fn wasm_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -787,17 +861,17 @@ pub(crate) unsafe fn wasm_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = byteswap_if_be::(v128_load(ptr.cast())); + let raw1 = byteswap_if_be::(v128_load(ptr.add(8).cast())); + let raw2 = byteswap_if_be::(v128_load(ptr.add(16).cast())); + let raw3 = byteswap_if_be::(v128_load(ptr.add(24).cast())); // Swap B↔R: output (R=ch2, G=ch1, B=ch0, A=ch3) let (b, g, r, a) = deinterleave_rgba64_8px(raw0, raw1, raw2, raw3); write_rgba_u16_8(r, g, b, a, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb.rs b/src/row/arch/wasm_simd128/tests/packed_rgb.rs index dbd979af..6e99d430 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb.rs @@ -207,9 +207,9 @@ fn simd128_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -224,9 +224,9 @@ fn simd128_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut 
out_wasm = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_wasm, w); + x2rgb10_to_rgba_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -241,9 +241,9 @@ fn simd128_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -258,9 +258,9 @@ fn simd128_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_wasm = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -275,9 +275,9 @@ fn simd128_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_wasm = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_wasm, w); + x2bgr10_to_rgba_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, @@ -292,9 +292,9 @@ fn simd128_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_wasm = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - 
x2bgr10_to_rgb_u16_row(&input, &mut out_wasm, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_wasm, w); } assert_eq!( out_scalar, out_wasm, diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs index e2ab1f78..40b1e770 100644 --- a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn wasm_rgb48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xDEAD_BEEF_1234_5678); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb diverges (width={w})"); } } @@ -49,8 +49,8 @@ fn wasm_rgb48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xCAFE_BABE_DEAD_1234); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba diverges (width={w})"); } } @@ -62,8 +62,8 @@ fn wasm_rgb48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xFEED_FACE_ABCD_EF01); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgb_u16 diverges (width={w})"); } } @@ -75,8 +75,8 @@ fn 
wasm_rgb48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x1234_5678_9ABC_DEF0); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgb48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgb48_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgb48→rgba_u16 diverges (width={w})"); } } @@ -92,8 +92,8 @@ fn wasm_bgr48_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 3, 0xABCD_EF01_2345_6789); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgb diverges (width={w})"); } } @@ -105,8 +105,8 @@ fn wasm_bgr48_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x9876_5432_10FE_DCBA); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba diverges (width={w})"); } } @@ -118,8 +118,8 @@ fn wasm_bgr48_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x0011_2233_4455_6677); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgb_u16_row::(&src, &mut simd_out, w) }; 
assert_eq!(scalar_out, simd_out, "bgr48→rgb_u16 diverges (width={w})"); } } @@ -131,8 +131,8 @@ fn wasm_bgr48_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 3, 0x8899_AABB_CCDD_EEFF); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgr48_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgr48_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgr48→rgba_u16 diverges (width={w})"); } } @@ -148,8 +148,8 @@ fn wasm_rgba64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xF0F0_F0F0_0F0F_0F0F); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb diverges (width={w})"); } } @@ -161,8 +161,8 @@ fn wasm_rgba64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x1357_9BDF_2468_ACE0); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba diverges (width={w})"); } } @@ -174,8 +174,8 @@ fn wasm_rgba64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x2468_ACE0_1357_9BDF); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgb_u16_row(&src, &mut simd_out, w) }; + 
scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgb_u16 diverges (width={w})"); } } @@ -187,8 +187,8 @@ fn wasm_rgba64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x3C3C_C3C3_5A5A_A5A5); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_rgba64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_rgba64_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "rgba64→rgba_u16 diverges (width={w})"); } } @@ -204,8 +204,8 @@ fn wasm_bgra64_to_rgb_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x7654_3210_FEDC_BA98); let mut scalar_out = std::vec![0u8; w * 3]; let mut simd_out = std::vec![0u8; w * 3]; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb diverges (width={w})"); } } @@ -217,8 +217,8 @@ fn wasm_bgra64_to_rgba_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xAABB_CCDD_EEFF_0011); let mut scalar_out = std::vec![0u8; w * 4]; let mut simd_out = std::vec![0u8; w * 4]; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgba_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba diverges (width={w})"); } } @@ -230,8 +230,8 @@ fn wasm_bgra64_to_rgb_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0x5566_7788_99AA_BBCC); let mut scalar_out = std::vec![0u16; w * 3]; let mut simd_out = std::vec![0u16; w * 3]; - 
scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgb_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgb_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgb_u16 diverges (width={w})"); } } @@ -243,8 +243,8 @@ fn wasm_bgra64_to_rgba_u16_matches_scalar() { let src = pseudo_random_u16(w * 4, 0xDDEE_FF00_1122_3344); let mut scalar_out = std::vec![0u16; w * 4]; let mut simd_out = std::vec![0u16; w * 4]; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, w); - unsafe { wasm_bgra64_to_rgba_u16_row(&src, &mut simd_out, w) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, w); + unsafe { wasm_bgra64_to_rgba_u16_row::(&src, &mut simd_out, w) }; assert_eq!(scalar_out, simd_out, "bgra64→rgba_u16 diverges (width={w})"); } } diff --git a/src/row/arch/x86_avx2/packed_rgb.rs b/src/row/arch/x86_avx2/packed_rgb.rs index bfae38a0..b90174f8 100644 --- a/src/row/arch/x86_avx2/packed_rgb.rs +++ b/src/row/arch/x86_avx2/packed_rgb.rs @@ -445,21 +445,27 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`]. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -471,21 +477,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2RGB10→RGBA. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -497,21 +509,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2RGB10→u16 RGB native. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -523,21 +541,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX2 X2BGR10→RGB. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -549,21 +573,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX2 X2BGR10→RGBA. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -575,21 +605,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX2 X2BGR10→u16 RGB native. 16 pixels per iteration. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x += 16; + if !BE { + while x + 16 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx2/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/packed_rgb_16bit.rs index 086a689f..db9343e2 100644 --- a/src/row/arch/x86_avx2/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/packed_rgb_16bit.rs @@ -297,6 +297,41 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3 subset of AVX2). +#[inline(always)] +unsafe fn byteswap128_if_be(v: __m128i) -> __m128i { + if BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Byte-swap every u16 lane in a `__m256i` when `BE = true`; no-op otherwise. 
+/// +/// Uses `_mm256_shuffle_epi8` (AVX2). +#[inline(always)] +unsafe fn byteswap256_if_be(v: __m256i) -> __m256i { + if BE { + // Same u16-lane byte-swap mask, broadcast to both 128-bit lanes. + const MASK: __m256i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm256_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -307,6 +342,7 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// target_feature, exploiting that SSE4.1/SSSE3 are AVX2 subsets. Each half /// deinterleaves with shuffle masks, narrows via `>> 8`, writes 8 pixels /// (24 bytes). 16 pixels are produced per outer loop iteration. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -315,7 +351,11 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -327,9 +367,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi let ptr = rgb48.as_ptr().add(x * 3); // First half: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -340,9 +380,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi // Second half: pixels x+8..x+15 let ptr8 = ptr.add(24); // 24 u16 ahead = 8 pixels × 3 channels - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -355,13 +395,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi } // Handle remaining pixels (< 16) via scalar fallback. 
if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → packed u8 RGBA. 16 pixels per outer iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -369,7 +411,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -381,9 +427,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -393,9 +439,9 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -407,13 +453,15 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGB (identity repack). 16 pixels per iteration. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -421,7 +469,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -430,29 +482,31 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let 
v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgb48 → native-depth u16 RGBA. 16 pixels per iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -460,7 +514,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -470,23 +528,23 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); 
let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -499,6 +557,7 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order; /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -507,7 +566,11 @@ pub(crate) unsafe fn avx2_rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -517,9 +580,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -529,9 +592,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgb_out.as_mut_ptr().add(x * 3), 24); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -543,13 +606,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi x += 16; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } 
/// AVX2 Bgr48 → packed u8 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -558,7 +622,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], wi /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -570,9 +638,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -582,9 +650,9 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], core::ptr::copy_nonoverlapping(tmp0.as_ptr(), rgba_out.as_mut_ptr().add(x * 4), 32); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, 
zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -596,13 +664,14 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 16; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGB. 16 pixels per outer iteration. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -611,7 +680,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -620,29 +693,30 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r0, g0, b0, rgb_out.as_mut_ptr().add(x * 3)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgb_u16_8(r1, g1, b1, rgb_out.as_mut_ptr().add((x + 8) * 3)); x += 16; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgr48 → native-depth u16 RGBA. 16 pixels per outer iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -651,7 +725,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16 /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -661,23 +739,23 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u while x + 16 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (b0, g0, r0) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r0, g0, b0, opaque, rgba_out.as_mut_ptr().add(x * 4)); let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = 
byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (b1, g1, r1) = deinterleave_rgb48_8px(v3, v4, v5); write_rgba_u16_8(r1, g1, b1, opaque, rgba_out.as_mut_ptr().add((x + 8) * 4)); x += 16; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -691,6 +769,7 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// Loads 4 × `__m256i` (64 u16 = 16 pixels), deinterleaves via the /// cascade helper, narrows via `>> 8` + `packus_epi16` + lane fix, writes /// 16 pixels (48 bytes) via `write_rgb_16` on the low 128 bits. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -699,7 +778,11 @@ pub(crate) unsafe fn avx2_bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -708,10 +791,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -720,13 +803,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -735,7 +819,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -744,10 +832,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -757,13 +845,15 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGB. 16 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX2 must be available. @@ -771,7 +861,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -779,10 +873,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Write in two 8-pixel halves using the existing 128-bit helper. write_rgb_u16_8( @@ -800,13 +894,14 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Rgba64 → native-depth u16 RGBA (identity copy). 16 pixels per iteration. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -815,7 +910,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -827,10 +922,10 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( _mm256_castsi256_si128(r_u16), @@ -849,7 +944,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( x += 16; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -862,6 +957,7 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_16px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -870,7 +966,11 @@ pub(crate) unsafe fn avx2_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -879,10 +979,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); @@ -892,13 +992,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 16; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → packed u8 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -907,7 +1008,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -916,10 +1021,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x16_to_u8x16(r_u16, zero256); let g_u8 = narrow_u16x16_to_u8x16(g_u16, zero256); @@ -929,13 +1034,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] x += 16; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGB. 16 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -944,7 +1050,11 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -952,10 +1062,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); // Swap B↔R: store (R, G, B) write_rgb_u16_8( @@ -973,13 +1083,14 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u x += 16; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX2 Bgra64 → native-depth u16 RGBA. 16 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -988,7 +1099,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1000,10 +1111,10 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm256_loadu_si256(ptr.cast()); - let raw1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw3 = _mm256_loadu_si256(ptr.add(48).cast()); + let raw0 = byteswap256_if_be::(_mm256_loadu_si256(ptr.cast())); + let raw1 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(16).cast())); + let raw2 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(32).cast())); + let raw3 = byteswap256_if_be::(_mm256_loadu_si256(ptr.add(48).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_16px(raw0, raw1, raw2, raw3); write_rgba_u16_8( @@ -1023,7 +1134,7 @@ pub(crate) unsafe fn avx2_bgra64_to_rgba_u16_row( x += 16; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx2/tests/packed_rgb.rs b/src/row/arch/x86_avx2/tests/packed_rgb.rs index 981c50e0..16ea736c 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb.rs @@ -231,9 +231,9 @@ fn avx2_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -251,9 +251,9 @@ fn avx2_x2rgb10_to_rgba_matches_scalar() 
{ let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -271,9 +271,9 @@ fn avx2_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -291,9 +291,9 @@ fn avx2_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -311,9 +311,9 @@ fn avx2_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -331,9 +331,9 @@ fn avx2_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + 
scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs index 1490d6e1..9dceec81 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs @@ -63,8 +63,8 @@ fn avx2_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -80,8 +80,8 @@ fn avx2_rgb48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xF0F0); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-16: SIMD vs scalar mismatch" @@ -97,8 +97,8 @@ fn avx2_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -115,8 +115,8 @@ fn 
avx2_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -136,8 +136,8 @@ fn avx2_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -157,8 +157,8 @@ fn avx2_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -175,8 +175,8 @@ fn avx2_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 
17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -196,8 +196,8 @@ fn avx2_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -217,8 +217,8 @@ fn avx2_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=17: SIMD vs scalar mismatch" @@ -234,8 +234,8 @@ fn avx2_bgr48_to_rgb_exact16_matches_scalar() { let src = make_rgb48_src(16, 0xA1A1); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-16: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx2_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(17); // reuse helper (ch0 treated as B) let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut 
scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx2_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=17: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx2_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx2_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn avx2_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = 
std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=17: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx2_rgba64_to_rgb_exact16_matches_scalar() { let src = make_rgba64_src(16, 0x0F0F); let mut simd_out = std::vec![0u8; 16 * 3]; let mut scalar_out = std::vec![0u8; 16 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 16) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 16); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 16) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 16); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-16: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx2_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx2_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=17: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx2_rgba64_to_rgba_lane_order_regression() { let src = 
make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx2_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx2_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx2_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut 
scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx2_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx2_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx2_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=17: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx2_bgra64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - 
scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx2_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=17: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx2_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx2_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx2_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut 
simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx2_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx2_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(17); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -722,7 +722,7 @@ fn avx2_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -739,7 +739,7 @@ fn 
avx2_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -755,7 +755,7 @@ fn avx2_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 4]; - unsafe { avx2_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..17 { @@ -774,7 +774,7 @@ fn avx2_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(17); let mut simd_out = std::vec![0u16; 17 * 3]; - unsafe { avx2_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; + unsafe { avx2_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; for n in 0..17 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); diff --git a/src/row/arch/x86_avx512/packed_rgb.rs b/src/row/arch/x86_avx512/packed_rgb.rs index 164804d6..84d6e8c6 100644 --- a/src/row/arch/x86_avx512/packed_rgb.rs +++ b/src/row/arch/x86_avx512/packed_rgb.rs @@ -446,23 +446,29 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// [`super::x86_common::x2rgb10_to_rgb_16_pixels`]. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2rgb10_to_rgb_16_pixels(base_in, base_out); - x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2rgb10_to_rgb_16_pixels(base_in, base_out); + x2rgb10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2rgb10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2rgb10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -474,23 +480,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2RGB10→RGBA. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2rgb10_to_rgba_16_pixels(base_in, base_out); - x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2rgb10_to_rgba_16_pixels(base_in, base_out); + x2rgb10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2rgb10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2rgb10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -502,23 +514,29 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2RGB10→u16 RGB native. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2rgb10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2rgb10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2rgb10_to_rgb_u16_8_pixels(base_in, base_out); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2rgb10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -530,23 +548,29 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// AVX-512 X2BGR10→RGB. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3); - x2bgr10_to_rgb_16_pixels(base_in, base_out); - x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); - x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); - x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3); + x2bgr10_to_rgb_16_pixels(base_in, base_out); + x2bgr10_to_rgb_16_pixels(base_in.add(64), base_out.add(48)); + x2bgr10_to_rgb_16_pixels(base_in.add(128), base_out.add(96)); + x2bgr10_to_rgb_16_pixels(base_in.add(192), base_out.add(144)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -558,23 +582,29 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// AVX-512 X2BGR10→RGBA. 64 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 64 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgba_out.as_mut_ptr().add(x * 4); - x2bgr10_to_rgba_16_pixels(base_in, base_out); - x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); - x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); - x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); - x += 64; + if !BE { + while x + 64 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgba_out.as_mut_ptr().add(x * 4); + x2bgr10_to_rgba_16_pixels(base_in, base_out); + x2bgr10_to_rgba_16_pixels(base_in.add(64), base_out.add(64)); + x2bgr10_to_rgba_16_pixels(base_in.add(128), base_out.add(128)); + x2bgr10_to_rgba_16_pixels(base_in.add(192), base_out.add(192)); + x += 64; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -586,23 +616,29 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// AVX-512 X2BGR10→u16 RGB native. 32 pixels per iteration. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 32 <= width { - let base_in = x2bgr10.as_ptr().add(x * 4); - let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); - x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); - x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); - x += 32; + if !BE { + while x + 32 <= width { + let base_in = x2bgr10.as_ptr().add(x * 4); + let base_out = rgb_out.as_mut_ptr().add(x * 3).cast::(); + x2bgr10_to_rgb_u16_8_pixels(base_in, base_out); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(32), base_out.add(48)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(64), base_out.add(96)); + x2bgr10_to_rgb_u16_8_pixels(base_in.add(96), base_out.add(144)); + x += 32; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_avx512/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/packed_rgb_16bit.rs index 243fff83..3b000cbb 100644 --- a/src/row/arch/x86_avx512/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/packed_rgb_16bit.rs @@ -240,6 +240,42 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { unsafe { _mm512_cvtusepi16_epi8(_mm512_srli_epi16::<8>(v)) } } +// ---- endian byte-swap helpers ----------------------------------------------- + +/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3, a subset of AVX-512). 
+#[inline(always)] +unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i { + if BE { + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + +/// Byte-swap every u16 lane in a `__m512i` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm512_shuffle_epi8` (AVX-512BW). +#[inline(always)] +unsafe fn byteswap512_if_be<const BE: bool>(v: __m512i) -> __m512i { + if BE { + // Same u16-lane byte-swap mask, broadcast across all 64 bytes. + const MASK: __m512i = unsafe { + core::mem::transmute([ + 1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, + 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, + 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm512_shuffle_epi8(v, MASK) } + } else { + v + } +} + +// ============================================================================= +// Rgb48 (R, G, B — 3 u16 elements per pixel) +// ============================================================================= @@ -249,6 +285,7 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { /// Processes four 8-pixel halves (3 × 128-bit loads each) under the /// AVX-512 target_feature context (SSE4.1/SSSE3 are subsets). Narrows /// each channel via `>> 8` and writes 8 pixels (24 bytes) per half. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -257,7 +294,11 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -268,9 +309,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], while x + 32 <= width { let ptr = rgb48.as_ptr().add(x * 3); // Half 0: pixels x..x+7 - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r0, g0, b0) = deinterleave_rgb48_8px(v0, v1, v2); let r0u8 = narrow_u16x8_to_u8x8(r0, zero); let g0u8 = narrow_u16x8_to_u8x8(g0, zero); @@ -281,9 +322,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 1: pixels x+8..x+15 let ptr8 = ptr.add(24); - let v3 = _mm_loadu_si128(ptr8.cast()); - let v4 = _mm_loadu_si128(ptr8.add(8).cast()); - let v5 = _mm_loadu_si128(ptr8.add(16).cast()); + let v3 = byteswap128_if_be::(_mm_loadu_si128(ptr8.cast())); + let v4 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(8).cast())); + let v5 = byteswap128_if_be::(_mm_loadu_si128(ptr8.add(16).cast())); let (r1, g1, b1) = deinterleave_rgb48_8px(v3, v4, v5); let r1u8 = narrow_u16x8_to_u8x8(r1, zero); let g1u8 = narrow_u16x8_to_u8x8(g1, zero); @@ -294,9 +335,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 2: pixels x+16..x+23 let ptr16 = ptr.add(48); - let v6 = _mm_loadu_si128(ptr16.cast()); - let v7 = _mm_loadu_si128(ptr16.add(8).cast()); - let v8 = _mm_loadu_si128(ptr16.add(16).cast()); + let v6 = 
byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.cast())); + let v7 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.add(8).cast())); + let v8 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr16.add(16).cast())); let (r2, g2, b2) = deinterleave_rgb48_8px(v6, v7, v8); let r2u8 = narrow_u16x8_to_u8x8(r2, zero); let g2u8 = narrow_u16x8_to_u8x8(g2, zero); @@ -307,9 +348,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], // Half 3: pixels x+24..x+31 let ptr24 = ptr.add(72); - let v9 = _mm_loadu_si128(ptr24.cast()); - let v10 = _mm_loadu_si128(ptr24.add(8).cast()); - let v11 = _mm_loadu_si128(ptr24.add(16).cast()); + let v9 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.cast())); + let v10 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.add(8).cast())); + let v11 = byteswap128_if_be::<BE>(_mm_loadu_si128(ptr24.add(16).cast())); let (r3, g3, b3) = deinterleave_rgb48_8px(v9, v10, v11); let r3u8 = narrow_u16x8_to_u8x8(r3, zero); let g3u8 = narrow_u16x8_to_u8x8(g3, zero); @@ -322,13 +363,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], } // Scalar tail: remaining < 32 pixels. if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → packed u8 RGBA. 32 pixels per outer iteration. Alpha /// forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -337,7 +379,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -351,9 +397,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] macro_rules! process_half { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -372,13 +418,15 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGB (identity repack). 32 pixels per iter. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. AVX-512F + AVX-512BW must be available. @@ -386,7 +434,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -397,9 +449,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u macro_rules! process_half_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -413,13 +465,14 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgb48 → native-depth u16 RGBA. 32 pixels per iter. Alpha forced to /// 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -428,7 +481,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -444,9 +497,9 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( macro_rules! 
process_half_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (r, g, b) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -460,7 +513,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -471,6 +524,7 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( /// AVX-512 Bgr48 → packed u8 RGB. 32 pixels per outer iteration. /// B↔R swap via passing `(ch2, ch1, ch0)` to write helpers. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -479,7 +533,11 @@ pub(crate) unsafe fn avx512_rgb48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -491,9 +549,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], macro_rules! 
process_half_bgr { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -512,13 +570,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], x += 32; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → packed u8 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -527,7 +586,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -541,9 +604,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] macro_rules! 
process_half_bgr_rgba { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); let ru8 = narrow_u16x8_to_u8x8(r, zero); let gu8 = narrow_u16x8_to_u8x8(g, zero); @@ -562,13 +625,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] x += 32; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGB. 32 pixels per iter. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -577,7 +641,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8] /// 3. `rgb_out.len() >= width * 3`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -588,9 +656,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u macro_rules! 
process_half_bgr_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r, g, b, rgb_out.as_mut_ptr().add($out_off)); }}; @@ -604,13 +672,14 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u x += 32; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgr48 → native-depth u16 RGBA. 32 pixels per iter. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -619,7 +688,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u /// 3. `rgba_out.len() >= width * 4`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -635,9 +704,9 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( macro_rules! 
process_half_bgr_rgba_u16 { ($ptr:expr, $out_off:expr) => {{ - let v0 = _mm_loadu_si128($ptr.cast()); - let v1 = _mm_loadu_si128($ptr.add(8).cast()); - let v2 = _mm_loadu_si128($ptr.add(16).cast()); + let v0 = byteswap128_if_be::(_mm_loadu_si128($ptr.cast())); + let v1 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(8).cast())); + let v2 = byteswap128_if_be::(_mm_loadu_si128($ptr.add(16).cast())); let (b, g, r) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8(r, g, b, opaque, rgba_out.as_mut_ptr().add($out_off)); }}; @@ -651,7 +720,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -666,6 +735,7 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( /// 32 pixels (96 bytes) via `write_rgb_16` on 128-bit quarters. /// /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -674,7 +744,11 @@ pub(crate) unsafe fn avx512_bgr48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -682,10 +756,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -707,13 +781,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// Source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -722,7 +797,11 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8] /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -730,10 +809,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -757,13 +836,14 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// Alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -772,7 +852,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -784,23 +864,24 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); // Use the shared write_rgb_u16_32 helper (writes 32 px = 4 × 8-px chunks). write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Rgba64 → native-depth u16 RGBA (identity copy). 32 pixels per iter. /// Source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -809,7 +890,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -821,10 +902,10 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let opaque = _mm_set1_epi16(-1i16); // 0xFFFF placeholder — not used; a_u16 has real alpha let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -862,7 +943,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( x += 32; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -875,6 +956,7 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_32px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -883,7 +965,11 @@ pub(crate) unsafe fn avx512_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -891,10 +977,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); @@ -916,13 +1002,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] x += 32; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → packed u8 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -931,7 +1018,11 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8] /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn avx512_bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -939,10 +1030,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let r_u8 = narrow_u16x32_to_u8x32(r_u16); let g_u8 = narrow_u16x32_to_u8x32(g_u16); @@ -966,13 +1057,14 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u x += 32; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGB. 32 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -981,7 +1073,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -993,23 +1085,24 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); write_rgb_u16_32(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 32; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// AVX-512 Bgra64 → native-depth u16 RGBA. 32 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -1018,7 +1111,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1030,10 +1123,10 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let raw0 = _mm512_loadu_si512(ptr.cast()); - let raw1 = _mm512_loadu_si512(ptr.add(32).cast()); - let raw2 = _mm512_loadu_si512(ptr.add(64).cast()); - let raw3 = _mm512_loadu_si512(ptr.add(96).cast()); + let raw0 = byteswap512_if_be::(_mm512_loadu_si512(ptr.cast())); + let raw1 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(32).cast())); + let raw2 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(64).cast())); + let raw3 = byteswap512_if_be::(_mm512_loadu_si512(ptr.add(96).cast())); // Swap B↔R: (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_32px(raw0, raw1, raw2, raw3); let out_ptr = rgba_out.as_mut_ptr().add(x * 4); @@ -1068,7 +1161,7 @@ pub(crate) unsafe fn avx512_bgra64_to_rgba_u16_row( x += 32; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_avx512/tests/packed_rgb.rs b/src/row/arch/x86_avx512/tests/packed_rgb.rs index 1cb18dbb..4fb00aa2 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn avx512_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, 
@@ -263,9 +263,9 @@ fn avx512_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_avx, w); + x2rgb10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -283,9 +283,9 @@ fn avx512_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_avx, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -303,9 +303,9 @@ fn avx512_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_avx = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -323,9 +323,9 @@ fn avx512_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_avx = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_avx, w); + x2bgr10_to_rgba_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, @@ -343,9 +343,9 @@ fn avx512_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_avx = std::vec![0u16; w * 3]; - 
scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_u16_row(&input, &mut out_avx, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_avx, w); } assert_eq!( out_scalar, out_avx, diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs index fe4c2536..4ae0709b 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs @@ -65,8 +65,8 @@ fn avx512_rgb48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0101); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=33: SIMD vs scalar mismatch" @@ -82,8 +82,8 @@ fn avx512_rgb48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xF0F0); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-32: SIMD vs scalar mismatch" @@ -99,8 +99,8 @@ fn avx512_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, 
"rgb48→rgb width=1: tail-only mismatch" @@ -117,8 +117,8 @@ fn avx512_rgb48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb lane order: SIMD vs scalar mismatch (channel swap?)" @@ -138,8 +138,8 @@ fn avx512_rgb48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0303); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=33: SIMD vs scalar mismatch" @@ -159,8 +159,8 @@ fn avx512_rgb48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0505); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -177,8 +177,8 @@ fn avx512_rgb48_to_rgb_u16_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgb48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { 
avx512_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 lane order: SIMD vs scalar mismatch (channel swap?)" @@ -198,8 +198,8 @@ fn avx512_rgb48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x0707); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgb48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -219,8 +219,8 @@ fn avx512_bgr48_to_rgb_matches_scalar_width33() { let src = make_rgb48_src(33, 0x1111); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=33: SIMD vs scalar mismatch" @@ -236,8 +236,8 @@ fn avx512_bgr48_to_rgb_exact32_matches_scalar() { let src = make_rgb48_src(32, 0xA1A1); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-32: SIMD vs scalar mismatch" @@ -254,8 +254,8 @@ fn avx512_bgr48_to_rgb_lane_order_regression() { let src = make_rgb48_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - 
unsafe { avx512_bgr48_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -275,8 +275,8 @@ fn avx512_bgr48_to_rgba_matches_scalar_width33() { let src = make_rgb48_src(33, 0x2222); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=33: SIMD vs scalar mismatch" @@ -296,8 +296,8 @@ fn avx512_bgr48_to_rgb_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x3333); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgr48_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -317,8 +317,8 @@ fn avx512_bgr48_to_rgba_u16_matches_scalar_width33() { let src = make_rgb48_src(33, 0x4444); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgr48_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -338,8 +338,8 @@ fn 
avx512_rgba64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0xAAAA); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=33: SIMD vs scalar mismatch" @@ -355,8 +355,8 @@ fn avx512_rgba64_to_rgb_exact32_matches_scalar() { let src = make_rgba64_src(32, 0x0F0F); let mut simd_out = std::vec![0u8; 32 * 3]; let mut scalar_out = std::vec![0u8; 32 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 32) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 32); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 32) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 32); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-32: SIMD vs scalar mismatch" @@ -373,8 +373,8 @@ fn avx512_rgba64_to_rgb_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb lane order: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn avx512_rgba64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0xBBBB); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 33); 
assert_eq!( simd_out, scalar_out, "rgba64→rgba width=33: SIMD vs scalar mismatch" @@ -412,8 +412,8 @@ fn avx512_rgba64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba lane order (alpha passthrough): SIMD vs scalar mismatch" @@ -433,8 +433,8 @@ fn avx512_rgba64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xCCCC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -451,8 +451,8 @@ fn avx512_rgba64_to_rgb_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -472,8 +472,8 @@ fn avx512_rgba64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDDDD); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) 
}; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -489,8 +489,8 @@ fn avx512_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -507,8 +507,8 @@ fn avx512_rgba64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 lane order (identity copy): SIMD vs scalar mismatch" @@ -528,8 +528,8 @@ fn avx512_bgra64_to_rgb_matches_scalar_width33() { let src = make_rgba64_src(33, 0x1234); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=33: SIMD vs scalar mismatch" @@ -546,8 +546,8 @@ fn avx512_bgra64_to_rgb_lane_order_regression() { let src = 
make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 3]; let mut scalar_out = std::vec![0u8; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgb lane order (B↔R swap): SIMD vs scalar mismatch" @@ -567,8 +567,8 @@ fn avx512_bgra64_to_rgba_matches_scalar_width33() { let src = make_rgba64_src(33, 0x5678); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=33: SIMD vs scalar mismatch" @@ -585,8 +585,8 @@ fn avx512_bgra64_to_rgba_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u8; 33 * 4]; let mut scalar_out = std::vec![0u8; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba lane order (B↔R swap + alpha): SIMD vs scalar mismatch" @@ -606,8 +606,8 @@ fn avx512_bgra64_to_rgb_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0x9ABC); let mut simd_out = std::vec![0u16; 33 * 3]; let mut scalar_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 33); 
assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=33: SIMD vs scalar mismatch" @@ -627,8 +627,8 @@ fn avx512_bgra64_to_rgba_u16_matches_scalar_width33() { let src = make_rgba64_src(33, 0xDEF0); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=33: SIMD vs scalar mismatch" @@ -644,8 +644,8 @@ fn avx512_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" @@ -662,8 +662,8 @@ fn avx512_bgra64_to_rgba_u16_lane_order_regression() { let src = make_rgba64_asymmetric(33); let mut simd_out = std::vec![0u16; 33 * 4]; let mut scalar_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 33); + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 33); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order (B↔R swap + alpha preserve): SIMD vs scalar mismatch" @@ -720,7 +720,7 @@ fn avx512_rgba64_to_rgba_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_rgba64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { 
avx512_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 4], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 4 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -737,7 +737,7 @@ fn avx512_rgba64_to_rgb_u16_lane_order_handcheck() { } let src = make_rgba64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_rgba64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); @@ -753,7 +753,7 @@ fn avx512_bgra64_to_rgba_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 4]; - unsafe { avx512_bgra64_to_rgba_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 33) }; // Output is RGBA: R=n+1, G=100+n, B=200+n, A=50+n per pixel n // (B↔R swap from source memory order). for n in 0..33 { @@ -772,7 +772,7 @@ fn avx512_bgra64_to_rgb_u16_lane_order_handcheck() { } let src = make_bgra64_lane_order(33); let mut simd_out = std::vec![0u16; 33 * 3]; - unsafe { avx512_bgra64_to_rgb_u16_row(&src, &mut simd_out, 33) }; + unsafe { avx512_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 33) }; for n in 0..33 { assert_eq!(simd_out[n * 3], (n as u16) + 1, "R at pixel {n}"); assert_eq!(simd_out[n * 3 + 1], (n as u16) + 100, "G at pixel {n}"); diff --git a/src/row/arch/x86_sse41/packed_rgb.rs b/src/row/arch/x86_sse41/packed_rgb.rs index e5bb35e8..12dccd40 100644 --- a/src/row/arch/x86_sse41/packed_rgb.rs +++ b/src/row/arch/x86_sse41/packed_rgb.rs @@ -426,18 +426,24 @@ pub(crate) unsafe fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: u /// 3. `x2rgb10` / `rgb_out` must not alias. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); - x += 16; + if !BE { + while x + 16 <= width { + x2rgb10_to_rgb_16_pixels(x2rgb10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgb_row( + scalar::x2rgb10_to_rgb_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -449,21 +455,27 @@ pub(crate) unsafe fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], widt /// SSE4.1 X2RGB10→RGBA. 16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2rgb10_to_rgba_16_pixels( - x2rgb10.as_ptr().add(x * 4), - rgba_out.as_mut_ptr().add(x * 4), - ); - x += 16; + if !BE { + while x + 16 <= width { + x2rgb10_to_rgba_16_pixels( + x2rgb10.as_ptr().add(x * 4), + rgba_out.as_mut_ptr().add(x * 4), + ); + x += 16; + } } if x < width { - scalar::x2rgb10_to_rgba_row( + scalar::x2rgb10_to_rgba_row::( &x2rgb10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -476,21 +488,27 @@ pub(crate) unsafe fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], wi /// `u16`, max value `1023`). 8 pixels per iteration. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - x2rgb10_to_rgb_u16_8_pixels( - x2rgb10.as_ptr().add(x * 4), - rgb_out.as_mut_ptr().add(x * 3).cast::(), - ); - x += 8; + if !BE { + while x + 8 <= width { + x2rgb10_to_rgb_u16_8_pixels( + x2rgb10.as_ptr().add(x * 4), + rgb_out.as_mut_ptr().add(x * 3).cast::(), + ); + x += 8; + } } if x < width { - scalar::x2rgb10_to_rgb_u16_row( + scalar::x2rgb10_to_rgb_u16_row::( &x2rgb10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -502,18 +520,24 @@ pub(crate) unsafe fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], /// SSE4.1 X2BGR10→RGB. 16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); - x += 16; + if !BE { + while x + 16 <= width { + x2bgr10_to_rgb_16_pixels(x2bgr10.as_ptr().add(x * 4), rgb_out.as_mut_ptr().add(x * 3)); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgb_row( + scalar::x2bgr10_to_rgb_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, @@ -525,21 +549,27 @@ pub(crate) unsafe fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], widt /// SSE4.1 X2BGR10→RGBA. 
16 pixels per iteration. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); unsafe { let mut x = 0usize; - while x + 16 <= width { - x2bgr10_to_rgba_16_pixels( - x2bgr10.as_ptr().add(x * 4), - rgba_out.as_mut_ptr().add(x * 4), - ); - x += 16; + if !BE { + while x + 16 <= width { + x2bgr10_to_rgba_16_pixels( + x2bgr10.as_ptr().add(x * 4), + rgba_out.as_mut_ptr().add(x * 4), + ); + x += 16; + } } if x < width { - scalar::x2bgr10_to_rgba_row( + scalar::x2bgr10_to_rgba_row::( &x2bgr10[x * 4..width * 4], &mut rgba_out[x * 4..width * 4], width - x, @@ -551,21 +581,27 @@ pub(crate) unsafe fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], wi /// SSE4.1 X2BGR10→u16 RGB native. 8 pixels per iteration. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); unsafe { let mut x = 0usize; - while x + 8 <= width { - x2bgr10_to_rgb_u16_8_pixels( - x2bgr10.as_ptr().add(x * 4), - rgb_out.as_mut_ptr().add(x * 3).cast::(), - ); - x += 8; + if !BE { + while x + 8 <= width { + x2bgr10_to_rgb_u16_8_pixels( + x2bgr10.as_ptr().add(x * 4), + rgb_out.as_mut_ptr().add(x * 3).cast::(), + ); + x += 8; + } } if x < width { - scalar::x2bgr10_to_rgb_u16_row( + scalar::x2bgr10_to_rgb_u16_row::( &x2bgr10[x * 4..width * 4], &mut rgb_out[x * 3..width * 3], width - x, diff --git a/src/row/arch/x86_sse41/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/packed_rgb_16bit.rs index b9dc50fa..c9a8bff4 100644 --- a/src/row/arch/x86_sse41/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/packed_rgb_16bit.rs @@ -306,6 +306,24 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { unsafe { _mm_packus_epi16(_mm_srli_epi16::<8>(v), zero) } } +// ---- endian byte-swap helper ------------------------------------------------ + +/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// +/// Uses `_mm_shuffle_epi8` (SSSE3, a subset of SSE4.1) with the same mask as +/// `endian::BYTESWAP_MASK_U16`. 
+#[inline(always)] +unsafe fn byteswap_if_be(v: __m128i) -> __m128i { + if BE { + // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] + const MASK: __m128i = + unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; + unsafe { _mm_shuffle_epi8(v, MASK) } + } else { + v + } +} + // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -314,6 +332,7 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { /// /// Loads 3 × 128-bit chunks (24 u16), deinterleaves with shuffle masks, /// narrows via `>> 8`, writes 8 pixels (24 bytes) of interleaved RGB. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -322,7 +341,11 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -331,9 +354,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -345,13 +368,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w x += 8; } if x < width { - scalar::rgb48_to_rgb_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_row::(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Rgb48 → packed u8 RGBA. 8 pixels per SIMD iteration. Alpha forced to 0xFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -359,7 +384,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], w /// 3. `rgba_out.len() >= width * 4`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -370,9 +399,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -383,14 +412,15 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::rgb48_to_rgba_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_row::(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } -/// SSE4.1 Rgb48 → native-depth u16 RGB (identity repack). 8 pixels per iteration. +/// SSE4.1 Rgb48 → native-depth u16 RGB. 8 pixels per iteration. /// /// Deinterleaves with shuffle masks, writes 8 pixels via `write_rgb_u16_8`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -399,7 +429,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row<const BE: bool>( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -407,21 +441,23 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1 let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgb48_to_rgb_u16_row(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::rgb48_to_rgb_u16_row::<BE>(&rgb48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Rgb48 → native-depth u16 RGBA. 8 pixels per iteration. Alpha forced to 0xFFFF. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -429,7 +465,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u1 /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( +pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row<const BE: bool>( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -442,9 +478,9 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgb48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (r_u16, g_u16, b_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8( r_u16, @@ -456,7 +492,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( x += 8; } if x < width { - scalar::rgb48_to_rgba_u16_row(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::rgb48_to_rgba_u16_row::<BE>(&rgb48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -469,6 +505,7 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( /// /// `deinterleave_rgb48_8px` yields `(B, G, R)` in source memory order; /// the B↔R swap is applied by passing them as `(R=ch2, G=ch1, B=ch0)`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -477,7 +514,11 @@ pub(crate) unsafe fn sse41_rgb48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgb_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -486,9 +527,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); // ch0=B, ch1=G, ch2=R (source BGR order) let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); @@ -500,13 +541,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w x += 8; } if x < width { - scalar::bgr48_to_rgb_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgr48 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -515,7 +557,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], w /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgba_row<const BE: bool>( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -526,9 +572,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -539,13 +585,14 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], x += 8; } if x < width { - scalar::bgr48_to_rgba_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Bgr48 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; values unchanged. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -554,7 +601,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize) { +pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row<const BE: bool>( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -562,22 +613,23 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1 let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); // Store as R, G, B (swap applied by argument order) write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgr48_to_rgb_u16_row(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); + scalar::bgr48_to_rgb_u16_row::<BE>(&bgr48[x * 3..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgr48 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; alpha forced to 0xFFFF. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -586,7 +638,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u1 /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( +pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row<const BE: bool>( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -599,9 +651,9 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgr48.as_ptr().add(x * 3); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); let (b_u16, g_u16, r_u16) = deinterleave_rgb48_8px(v0, v1, v2); write_rgba_u16_8( r_u16, @@ -613,7 +665,7 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( x += 8; } if x < width { - scalar::bgr48_to_rgba_u16_row(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); + scalar::bgr48_to_rgba_u16_row::<BE>(&bgr48[x * 3..], &mut rgba_out[x * 4..], width - x); } } } @@ -624,6 +676,8 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( /// SSE4.1 Rgba64 → packed u8 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -631,7 +685,11 @@ pub(crate) unsafe fn sse41_bgr48_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgba64_to_rgb_row<const BE: bool>( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -640,10 +698,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -654,7 +712,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::rgba64_to_rgb_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -662,6 +720,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// SSE4.1 Rgba64 → packed u8 RGBA. 8 pixels per SIMD iteration. Source alpha passes through. /// /// All 4 channels narrowed via `>> 8`. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -670,7 +729,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_rgba64_to_rgba_row<const BE: bool>( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -679,10 +742,10 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -694,13 +757,15 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 x += 8; } if x < width { - scalar::rgba64_to_rgba_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Rgba64 → native-depth u16 RGB. 8 pixels per SIMD iteration. Alpha discarded. /// +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. +/// /// # Safety /// /// 1. SSE4.1 must be available. @@ -708,7 +773,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8 /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( +pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row<const BE: bool>( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -720,16 +785,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::rgba64_to_rgb_u16_row(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::rgba64_to_rgb_u16_row::<BE>(&rgba64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } @@ -737,6 +802,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( /// SSE4.1 Rgba64 → native-depth u16 RGBA (identity copy). 8 pixels per iteration. /// /// All 4 channels passed through at native depth; source alpha preserved. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -745,7 +811,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( +pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row<const BE: bool>( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -757,16 +823,16 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = rgba64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (r_u16, g_u16, b_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::rgba64_to_rgba_u16_row(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::rgba64_to_rgba_u16_row::<BE>(&rgba64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } @@ -779,6 +845,7 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( /// B↔R swap; alpha discarded. /// /// `deinterleave_rgba64_8px` yields `(B, G, R, A)` in source memory order. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -787,7 +854,11 @@ pub(crate) unsafe fn sse41_rgba64_to_rgba_u16_row( /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgra64_to_rgb_row<const BE: bool>( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); @@ -796,10 +867,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); // ch0=B, ch1=G, ch2=R, ch3=A (source BGRA order) let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); @@ -811,13 +882,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], x += 8; } if x < width { - scalar::bgra64_to_rgb_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgra64 → packed u8 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha passes through (narrowed via `>> 8`). +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -826,7 +898,11 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) unsafe fn sse41_bgra64_to_rgba_row<const BE: bool>( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); @@ -835,10 +911,10 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); let r_u8 = narrow_u16x8_to_u8x8(r_u16, zero); let g_u8 = narrow_u16x8_to_u8x8(g_u16, zero); @@ -850,13 +926,14 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 x += 8; } if x < width { - scalar::bgra64_to_rgba_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } /// SSE4.1 Bgra64 → native-depth u16 RGB. 8 pixels per SIMD iteration. /// B↔R swap; alpha discarded. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -865,7 +942,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8 /// 3. `rgb_out.len() >= width * 3`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( +pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row<const BE: bool>( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -877,23 +954,24 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); let (b_u16, g_u16, r_u16, _a) = deinterleave_rgba64_8px(v0, v1, v2, v3); // Swap B↔R: store (R, G, B) write_rgb_u16_8(r_u16, g_u16, b_u16, rgb_out.as_mut_ptr().add(x * 3)); x += 8; } if x < width { - scalar::bgra64_to_rgb_u16_row(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); + scalar::bgra64_to_rgb_u16_row::<BE>(&bgra64[x * 4..], &mut rgb_out[x * 3..], width - x); } } } /// SSE4.1 Bgra64 → native-depth u16 RGBA. 8 pixels per SIMD iteration. /// B↔R swap; source alpha preserved at position 3. +/// When `BE = true` each loaded register is byte-swapped before deinterleaving. /// /// # Safety /// @@ -902,7 +980,7 @@ pub(crate) unsafe fn sse41_bgra64_to_rgb_u16_row( /// 3. `rgba_out.len() >= width * 4`.
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( +pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row<const BE: bool>( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -914,17 +992,17 @@ pub(crate) unsafe fn sse41_bgra64_to_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let ptr = bgra64.as_ptr().add(x * 4); - let v0 = _mm_loadu_si128(ptr.cast()); - let v1 = _mm_loadu_si128(ptr.add(8).cast()); - let v2 = _mm_loadu_si128(ptr.add(16).cast()); - let v3 = _mm_loadu_si128(ptr.add(24).cast()); + let v0 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.cast())); + let v1 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(8).cast())); + let v2 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(16).cast())); + let v3 = byteswap_if_be::<BE>(_mm_loadu_si128(ptr.add(24).cast())); // Swap B↔R: store (R=ch2, G=ch1, B=ch0, A=ch3) let (b_u16, g_u16, r_u16, a_u16) = deinterleave_rgba64_8px(v0, v1, v2, v3); write_rgba_u16_8(r_u16, g_u16, b_u16, a_u16, rgba_out.as_mut_ptr().add(x * 4)); x += 8; } if x < width { - scalar::bgra64_to_rgba_u16_row(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); + scalar::bgra64_to_rgba_u16_row::<BE>(&bgra64[x * 4..], &mut rgba_out[x * 4..], width - x); } } } diff --git a/src/row/arch/x86_sse41/tests/packed_rgb.rs b/src/row/arch/x86_sse41/tests/packed_rgb.rs index 10f81926..e64aeba0 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb.rs @@ -243,9 +243,9 @@ fn sse41_x2rgb10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2rgb10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_row::<false>(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_row::<false>(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -263,9 +263,9 @@ fn sse41_x2rgb10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar =
std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2rgb10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgba_row(&input, &mut out_sse, w); + x2rgb10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -283,9 +283,9 @@ fn sse41_x2rgb10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2rgb10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2rgb10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { - x2rgb10_to_rgb_u16_row(&input, &mut out_sse, w); + x2rgb10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -303,9 +303,9 @@ fn sse41_x2bgr10_to_rgb_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 3]; let mut out_sse = std::vec![0u8; w * 3]; - scalar::x2bgr10_to_rgb_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgb_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -323,9 +323,9 @@ fn sse41_x2bgr10_to_rgba_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u8; w * 4]; let mut out_sse = std::vec![0u8; w * 4]; - scalar::x2bgr10_to_rgba_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgba_row::(&input, &mut out_scalar, w); unsafe { - x2bgr10_to_rgba_row(&input, &mut out_sse, w); + x2bgr10_to_rgba_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, @@ -343,9 +343,9 @@ fn sse41_x2bgr10_to_rgb_u16_matches_scalar() { let input = pseudo_random_rgba(w); let mut out_scalar = std::vec![0u16; w * 3]; let mut out_sse = std::vec![0u16; w * 3]; - scalar::x2bgr10_to_rgb_u16_row(&input, &mut out_scalar, w); + scalar::x2bgr10_to_rgb_u16_row::(&input, &mut out_scalar, w); unsafe { 
- x2bgr10_to_rgb_u16_row(&input, &mut out_sse, w); + x2bgr10_to_rgb_u16_row::(&input, &mut out_sse, w); } assert_eq!( out_scalar, out_sse, diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs index 319ee5f9..57c5c8b6 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs @@ -36,8 +36,8 @@ fn sse41_rgb48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0101); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=17: SIMD vs scalar mismatch" @@ -53,8 +53,8 @@ fn sse41_rgb48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xF0F0); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgb48→rgb exact-8: SIMD vs scalar mismatch" @@ -70,8 +70,8 @@ fn sse41_rgb48_to_rgb_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut simd_out = [0u8; 3]; let mut scalar_out = [0u8; 3]; - unsafe { sse41_rgb48_to_rgb_row(&src, &mut simd_out, 1) }; - scalar::rgb48_to_rgb_row(&src, &mut scalar_out, 1); + unsafe { sse41_rgb48_to_rgb_row::(&src, &mut simd_out, 1) }; + scalar::rgb48_to_rgb_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgb48→rgb width=1: tail-only mismatch" @@ -91,8 +91,8 @@ fn sse41_rgb48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0303); let mut simd_out = 
std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_rgb48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba width=17: SIMD vs scalar mismatch" @@ -112,8 +112,8 @@ fn sse41_rgb48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0505); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_rgb48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -133,8 +133,8 @@ fn sse41_rgb48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x0707); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_rgb48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgb48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgb48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgb48_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgb48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -154,8 +154,8 @@ fn sse41_bgr48_to_rgb_matches_scalar_width17() { let src = make_rgb48_src(17, 0x1111); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb width=17: SIMD vs scalar mismatch" @@ -171,8 
+171,8 @@ fn sse41_bgr48_to_rgb_exact8_matches_scalar() { let src = make_rgb48_src(8, 0xA1A1); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_bgr48_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::bgr48_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_bgr48_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::bgr48_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "bgr48→rgb exact-8: SIMD vs scalar mismatch" @@ -192,8 +192,8 @@ fn sse41_bgr48_to_rgba_matches_scalar_width17() { let src = make_rgb48_src(17, 0x2222); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_bgr48_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba width=17: SIMD vs scalar mismatch" @@ -213,8 +213,8 @@ fn sse41_bgr48_to_rgb_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x3333); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_bgr48_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -234,8 +234,8 @@ fn sse41_bgr48_to_rgba_u16_matches_scalar_width17() { let src = make_rgb48_src(17, 0x4444); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_bgr48_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgr48_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgr48_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgr48_to_rgba_u16_row::(&src, &mut 
scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgr48→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -255,8 +255,8 @@ fn sse41_rgba64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0xAAAA); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb width=17: SIMD vs scalar mismatch" @@ -272,8 +272,8 @@ fn sse41_rgba64_to_rgb_exact8_matches_scalar() { let src = make_rgba64_src(8, 0x0F0F); let mut simd_out = std::vec![0u8; 8 * 3]; let mut scalar_out = std::vec![0u8; 8 * 3]; - unsafe { sse41_rgba64_to_rgb_row(&src, &mut simd_out, 8) }; - scalar::rgba64_to_rgb_row(&src, &mut scalar_out, 8); + unsafe { sse41_rgba64_to_rgb_row::(&src, &mut simd_out, 8) }; + scalar::rgba64_to_rgb_row::(&src, &mut scalar_out, 8); assert_eq!( simd_out, scalar_out, "rgba64→rgb exact-8: SIMD vs scalar mismatch" @@ -293,8 +293,8 @@ fn sse41_rgba64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0xBBBB); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 * 4]; - unsafe { sse41_rgba64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba width=17: SIMD vs scalar mismatch" @@ -314,8 +314,8 @@ fn sse41_rgba64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xCCCC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_rgba64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { 
sse41_rgba64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -335,8 +335,8 @@ fn sse41_rgba64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDDDD); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -352,8 +352,8 @@ fn sse41_rgba64_to_rgba_u16_width1_tail_only() { let src = [0x1234u16, 0x5678, 0x9ABC, 0xDEF0]; // R, G, B, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { sse41_rgba64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { sse41_rgba64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::rgba64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "rgba64→rgba_u16 width=1: tail-only mismatch" @@ -373,8 +373,8 @@ fn sse41_bgra64_to_rgb_matches_scalar_width17() { let src = make_rgba64_src(17, 0x1234); let mut simd_out = std::vec![0u8; 17 * 3]; let mut scalar_out = std::vec![0u8; 17 * 3]; - unsafe { sse41_bgra64_to_rgb_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgb_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb width=17: SIMD vs scalar mismatch" @@ -394,8 +394,8 @@ fn sse41_bgra64_to_rgba_matches_scalar_width17() { let src = make_rgba64_src(17, 0x5678); let mut simd_out = std::vec![0u8; 17 * 4]; let mut scalar_out = std::vec![0u8; 17 
* 4]; - unsafe { sse41_bgra64_to_rgba_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgba_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba width=17: SIMD vs scalar mismatch" @@ -415,8 +415,8 @@ fn sse41_bgra64_to_rgb_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0x9ABC); let mut simd_out = std::vec![0u16; 17 * 3]; let mut scalar_out = std::vec![0u16; 17 * 3]; - unsafe { sse41_bgra64_to_rgb_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 width=17: SIMD vs scalar mismatch" @@ -436,8 +436,8 @@ fn sse41_bgra64_to_rgba_u16_matches_scalar_width17() { let src = make_rgba64_src(17, 0xDEF0); let mut simd_out = std::vec![0u16; 17 * 4]; let mut scalar_out = std::vec![0u16; 17 * 4]; - unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 17) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 17); + unsafe { sse41_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 17) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 17); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=17: SIMD vs scalar mismatch" @@ -453,8 +453,8 @@ fn sse41_bgra64_to_rgba_u16_width1_tail_only() { let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; // B, G, R, A let mut simd_out = [0u16; 4]; let mut scalar_out = [0u16; 4]; - unsafe { sse41_bgra64_to_rgba_u16_row(&src, &mut simd_out, 1) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 1); + unsafe { sse41_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 1) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 1); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 width=1: tail-only mismatch" diff --git 
a/src/row/dispatch/packed_rgb_16bit.rs b/src/row/dispatch/packed_rgb_16bit.rs index 6e317177..6ceb854a 100644 --- a/src/row/dispatch/packed_rgb_16bit.rs +++ b/src/row/dispatch/packed_rgb_16bit.rs @@ -72,7 +72,12 @@ fn rgba64_packed_elems(width: usize) -> usize { /// Converts one row of `Rgb48` to packed u8 RGB. Each 16-bit channel is /// narrowed via `>> 8`. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgb48_to_rgb_row( + rgb48: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -81,38 +86,43 @@ pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_sim cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row(rgb48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_row::(rgb48, rgb_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgb_row(rgb48, rgb_out, width); + 
scalar::rgb48_to_rgb_row::(rgb48, rgb_out, width); } /// Converts one row of `Rgb48` to packed u8 RGBA. Alpha forced to `0xFF`. /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgb48_to_rgba_row( + rgb48: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -121,38 +131,43 @@ pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_s cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row(rgb48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_row::(rgb48, rgba_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgba_row(rgb48, rgba_out, width); + scalar::rgb48_to_rgba_row::(rgb48, rgba_out, width); } /// Converts one row of `Rgb48` to native-depth u16 RGB (identity copy). /// `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_elems(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -161,38 +176,43 @@ pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, us cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row(rgb48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgb_u16_row(rgb48, rgb_out, width); + scalar::rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } /// Converts one row of `Rgb48` to native-depth u16 RGBA. Alpha forced to /// `0xFFFF`. `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_elems(width); assert!(rgb48.len() >= in_min, "rgb48 row too short"); @@ -201,32 +221,32 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::neon::neon_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row(rgb48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } return; }, _ => {} } } - scalar::rgb48_to_rgba_u16_row(rgb48, rgba_out, width); + scalar::rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } /// Derives 8-bit luma from one row of `Rgb48` source. Narrows to u8 RGB via @@ -234,7 +254,7 @@ pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, /// `rgb_to_luma_row`. `use_simd = false` forces the scalar path for both steps. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_row( +pub fn rgb48_to_luma_row( rgb48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -248,7 +268,7 @@ pub fn rgb48_to_luma_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -258,7 +278,7 @@ pub fn rgb48_to_luma_row( /// the scalar path for both steps. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_u16_row( +pub fn rgb48_to_luma_u16_row( rgb48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -272,7 +292,7 @@ pub fn rgb48_to_luma_u16_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -281,7 +301,7 @@ pub fn rgb48_to_luma_u16_row( /// `rgb_to_hsv_row`. `use_simd = false` forces the scalar path for both steps. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_hsv_row( +pub fn rgb48_to_hsv_row( rgb48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -297,7 +317,7 @@ pub fn rgb48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgb48_to_rgb_row(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -308,7 +328,12 @@ pub fn rgb48_to_hsv_row( /// Converts one row of `Bgr48` to packed u8 RGB (B↔R swap, narrow via `>> 8`). /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgr48_to_rgb_row( + bgr48: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -317,38 +342,43 @@ pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_sim cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row(bgr48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_row::(bgr48, rgb_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgb_row(bgr48, rgb_out, width); + scalar::bgr48_to_rgb_row::(bgr48, rgb_out, width); } /// Converts one row of `Bgr48` to packed u8 RGBA (B↔R swap, alpha forced to /// `0xFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgr48_to_rgba_row( + bgr48: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -357,38 +387,43 @@ pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row(bgr48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_row::(bgr48, rgba_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgba_row(bgr48, rgba_out, width); + scalar::bgr48_to_rgba_row::(bgr48, rgba_out, width); } /// Converts one row of `Bgr48` to native-depth u16 RGB (B↔R swap, values /// unchanged). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgb_row_elems(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -397,38 +432,43 @@ pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row(bgr48, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgb_u16_row(bgr48, rgb_out, width); + scalar::bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } /// Converts one row of `Bgr48` to native-depth u16 RGBA (B↔R swap, alpha /// forced to `0xFFFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgb48_packed_elems(width); let out_min = rgba_row_elems(width); assert!(bgr48.len() >= in_min, "bgr48 row too short"); @@ -437,39 +477,39 @@ pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::neon::neon_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row(bgr48, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } return; }, _ => {} } } - scalar::bgr48_to_rgba_u16_row(bgr48, rgba_out, width); + scalar::bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } /// Derives 8-bit luma from one row of `Bgr48` source. Narrows to u8 RGB via /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_row( +pub fn bgr48_to_luma_row( bgr48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -483,7 +523,7 @@ pub fn bgr48_to_luma_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -491,7 +531,7 @@ pub fn bgr48_to_luma_row( /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_u16_row( +pub fn bgr48_to_luma_u16_row( bgr48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -505,7 +545,7 @@ pub fn bgr48_to_luma_u16_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -513,7 +553,7 @@ pub fn bgr48_to_luma_u16_row( /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_hsv_row( +pub fn bgr48_to_hsv_row( bgr48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -529,7 +569,7 @@ pub fn bgr48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgr48_to_rgb_row(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -540,7 +580,12 @@ pub fn bgr48_to_hsv_row( /// Converts one row of `Rgba64` to packed u8 RGB. Source alpha is discarded; /// R/G/B narrowed via `>> 8`. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgba64_to_rgb_row( + rgba64: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -549,38 +594,43 @@ pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row(rgba64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_row::(rgba64, rgb_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgb_row(rgba64, rgb_out, width); + scalar::rgba64_to_rgb_row::(rgba64, rgb_out, width); } /// Converts one row of `Rgba64` to packed u8 RGBA. All 4 channels narrowed via /// `>> 8`; source alpha passes through. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -589,38 +639,43 @@ pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row(rgba64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_row::(rgba64, rgba_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgba_row(rgba64, rgba_out, width); + scalar::rgba64_to_rgba_row::(rgba64, rgba_out, width); } /// Converts one row of `Rgba64` to native-depth u16 RGB. Source alpha /// discarded; R/G/B copied as-is. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_elems(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -629,38 +684,43 @@ pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row(rgba64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgb_u16_row(rgba64, rgb_out, width); + scalar::rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } /// Converts one row of `Rgba64` to native-depth u16 RGBA (identity copy of all /// 4 channels; source alpha preserved). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn rgba64_to_rgba_u16_row( + rgba64: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_elems(width); assert!(rgba64.len() >= in_min, "rgba64 row too short"); @@ -669,32 +729,32 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::neon::neon_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row(rgba64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } return; }, _ => {} } } - scalar::rgba64_to_rgba_u16_row(rgba64, rgba_out, width); + scalar::rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } /// Derives 8-bit luma from one row of `Rgba64` source. Narrows to u8 RGB via @@ -702,7 +762,7 @@ pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_row( +pub fn rgba64_to_luma_row( rgba64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -716,7 +776,7 @@ pub fn rgba64_to_luma_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -725,7 +785,7 @@ pub fn rgba64_to_luma_row( /// Source alpha is discarded. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_u16_row( +pub fn rgba64_to_luma_u16_row( rgba64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -739,7 +799,7 @@ pub fn rgba64_to_luma_u16_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -748,7 +808,7 @@ pub fn rgba64_to_luma_u16_row( /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_hsv_row( +pub fn rgba64_to_hsv_row( rgba64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -764,7 +824,7 @@ pub fn rgba64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgba64_to_rgb_row(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -775,7 +835,12 @@ pub fn rgba64_to_hsv_row( /// Converts one row of `Bgra64` to packed u8 RGB (B↔R swap, drop alpha, /// narrow via `>> 8`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_row( + bgra64: &[u16], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -784,39 +849,44 @@ pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_s cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row(bgra64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_row::(bgra64, rgb_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgb_row(bgra64, rgb_out, width); + scalar::bgra64_to_rgb_row::(bgra64, rgb_out, width); } /// Converts one row of `Bgra64` to packed u8 RGBA (B↔R swap, all 4 channels /// narrowed via `>> 8`; source alpha passes through). `use_simd = false` forces /// the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_bytes(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -825,38 +895,43 @@ pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row(bgra64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_row::(bgra64, rgba_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgba_row(bgra64, rgba_out, width); + scalar::bgra64_to_rgba_row::(bgra64, rgba_out, width); } /// Converts one row of `Bgra64` to native-depth u16 RGB (B↔R swap, drop alpha, /// values copied as-is). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgb_row_elems(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -865,38 +940,43 @@ pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row(bgra64, rgb_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgb_u16_row(bgra64, rgb_out, width); + scalar::bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } /// Converts one row of `Bgra64` to native-depth u16 RGBA (B↔R swap; source /// alpha preserved at position 3). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { +pub fn bgra64_to_rgba_u16_row( + bgra64: &[u16], + rgba_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba64_packed_elems(width); let out_min = rgba_row_elems(width); assert!(bgra64.len() >= in_min, "bgra64 row too short"); @@ -905,32 +985,32 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::neon_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::neon::neon_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx512::avx512_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_avx2::avx2_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::x86_sse41::sse41_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; } }, all(target_arch = "wasm32", target_feature = "simd128") => { - unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row(bgra64, rgba_out, width); } + unsafe { arch::wasm_simd128::wasm_bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } return; }, _ => {} } } - scalar::bgra64_to_rgba_u16_row(bgra64, rgba_out, width); + scalar::bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } /// Derives 8-bit luma from one row of `Bgra64` source. Narrows to u8 RGB via @@ -938,7 +1018,7 @@ pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_row( +pub fn bgra64_to_luma_row( bgra64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -952,7 +1032,7 @@ pub fn bgra64_to_luma_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -961,7 +1041,7 @@ pub fn bgra64_to_luma_row( /// Source alpha is discarded. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_u16_row( +pub fn bgra64_to_luma_u16_row( bgra64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -975,7 +1055,7 @@ pub fn bgra64_to_luma_u16_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } @@ -984,7 +1064,7 @@ pub fn bgra64_to_luma_u16_row( /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_hsv_row( +pub fn bgra64_to_hsv_row( bgra64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1000,7 +1080,7 @@ pub fn bgra64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgra64_to_rgb_row(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } @@ -1035,7 +1115,7 @@ mod tests { // All-white Rgb48: each u16 channel = 0xFFFF; narrowed >> 8 = 0xFF. let src = solid_rgb48(4, 0xFFFF); let mut rgb = std::vec![0u8; 4 * 3]; - rgb48_to_rgb_row(&src, &mut rgb, 4, false); + rgb48_to_rgb_row::(&src, &mut rgb, 4, false); assert!( rgb.iter().all(|&v| v == 0xFF), "expected all 0xFF, got {rgb:?}" @@ -1046,7 +1126,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_scalar_path() { let src = solid_rgb48(4, 0x1200); let mut rgba = std::vec![0u8; 4 * 4]; - rgb48_to_rgba_row(&src, &mut rgba, 4, false); + rgb48_to_rgba_row::(&src, &mut rgba, 4, false); for px in rgba.chunks(4) { assert_eq!(px[0], 0x12, "R channel"); assert_eq!(px[3], 0xFF, "alpha forced to 0xFF"); @@ -1057,7 +1137,7 @@ mod tests { fn rgb48_dispatcher_to_rgb_u16_scalar_path() { let src = solid_rgb48(4, 0xABCD); let mut rgb_u16 = std::vec![0u16; 4 * 3]; - rgb48_to_rgb_u16_row(&src, &mut rgb_u16, 4, false); + rgb48_to_rgb_u16_row::(&src, &mut rgb_u16, 4, false); assert!( rgb_u16.iter().all(|&v| v == 0xABCD), "expected identity copy" @@ -1068,7 +1148,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_u16_scalar_path() { let src = solid_rgb48(4, 0x1234); let mut rgba_u16 = std::vec![0u16; 4 * 4]; - rgb48_to_rgba_u16_row(&src, &mut rgba_u16, 4, false); + rgb48_to_rgba_u16_row::(&src, &mut rgba_u16, 4, false); for px in rgba_u16.chunks(4) { assert_eq!(px[0], 0x1234, "R channel"); assert_eq!(px[3], 0xFFFF, 
"alpha forced to 0xFFFF"); @@ -1081,7 +1161,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1100,7 +1180,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgb48_to_luma_u16_row( + rgb48_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1125,7 +1205,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgb48_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + rgb48_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 0, "H for pure red must be 0"); assert_eq!(s[0], 255, "S for pure red must be 255"); assert!(v[0] >= 254, "V for pure red must be near 255, got {}", v[0]); @@ -1138,7 +1218,7 @@ mod tests { // Bgr48 pixel [B=0x1100, G=0x2200, R=0x3300] → rgb [R=0x33, G=0x22, B=0x11]. 
let src = [0x1100u16, 0x2200, 0x3300]; let mut rgb = [0u8; 3]; - bgr48_to_rgb_row(&src, &mut rgb, 1, false); + bgr48_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1148,7 +1228,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_scalar_path() { let src = [0x1100u16, 0x2200, 0x3300]; let mut rgba = [0u8; 4]; - bgr48_to_rgba_row(&src, &mut rgba, 1, false); + bgr48_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R"); assert_eq!(rgba[3], 0xFF, "alpha forced to 0xFF"); } @@ -1157,7 +1237,7 @@ mod tests { fn bgr48_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgb_u16 = [0u16; 3]; - bgr48_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + bgr48_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1167,7 +1247,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgba_u16 = [0u16; 4]; - bgr48_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + bgr48_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R"); assert_eq!(rgba_u16[3], 0xFFFF, "alpha forced to 0xFFFF"); } @@ -1177,7 +1257,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); // all channels = 0xFF00 let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - bgr48_to_luma_row( + bgr48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1196,7 +1276,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgr48_to_luma_u16_row( + bgr48_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1219,7 +1299,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgr48_to_hsv_row(&src, &mut h, 
&mut s, &mut v, &mut scratch, 1, false); + bgr48_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure blue must be 255"); assert!( @@ -1236,7 +1316,7 @@ mod tests { // Source alpha should be dropped; R/G/B narrowed. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; // R, G, B, A let mut rgb = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut rgb, 1, false); + rgba64_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x11, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x33, "B"); @@ -1247,7 +1327,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8. let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - rgba64_to_rgba_row(&src, &mut rgba, 1, false); + rgba64_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1255,7 +1335,7 @@ mod tests { fn rgba64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut rgb_u16 = [0u16; 3]; - rgba64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + rgba64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x1111, "R"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x3333, "B"); @@ -1266,7 +1346,7 @@ mod tests { // Identity copy; source alpha preserved. 
let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; let mut rgba_u16 = [0u16; 4]; - rgba64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + rgba64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x1111, "R"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1277,7 +1357,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgba64_to_luma_row( + rgba64_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1296,7 +1376,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgba64_to_luma_u16_row( + rgba64_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1321,7 +1401,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgba64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + rgba64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 60, "H for pure green must be 60 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure green must be 255"); assert!( @@ -1338,7 +1418,7 @@ mod tests { // Bgra64: B=0x1100, G=0x2200, R=0x3300, A=0xDEAD → RGB [R=0x33, G=0x22, B=0x11]. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut rgb = [0u8; 3]; - bgra64_to_rgb_row(&src, &mut rgb, 1, false); + bgra64_to_rgb_row::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1349,7 +1429,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8; channels swapped. 
let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - bgra64_to_rgba_row(&src, &mut rgba, 1, false); + bgra64_to_rgba_row::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R (from position 2)"); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1358,7 +1438,7 @@ mod tests { fn bgra64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; // B, G, R, A let mut rgb_u16 = [0u16; 3]; - bgra64_to_rgb_u16_row(&src, &mut rgb_u16, 1, false); + bgra64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1368,7 +1448,7 @@ mod tests { fn bgra64_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; // B, G, R, A let mut rgba_u16 = [0u16; 4]; - bgra64_to_rgba_u16_row(&src, &mut rgba_u16, 1, false); + bgra64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1378,7 +1458,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - bgra64_to_luma_row( + bgra64_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1397,7 +1477,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgra64_to_luma_u16_row( + bgra64_to_luma_u16_row::( &src, &mut luma, &mut scratch, @@ -1423,7 +1503,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgra64_to_hsv_row(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + bgra64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure blue must be 255"); assert!( @@ 
-1440,7 +1520,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_input() { let src = [0u16; 2]; // needs 3 for width=1 let mut out = [0u8; 3]; - rgb48_to_rgb_row(&src, &mut out, 1, false); + rgb48_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1448,7 +1528,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_output() { let src = [0u16; 3]; let mut out = [0u8; 2]; // needs 3 - rgb48_to_rgb_row(&src, &mut out, 1, false); + rgb48_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1456,7 +1536,7 @@ mod tests { fn rgba64_to_rgb_row_rejects_short_input() { let src = [0u16; 3]; // needs 4 for width=1 let mut out = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut out, 1, false); + rgba64_to_rgb_row::(&src, &mut out, 1, false); } #[test] @@ -1464,7 +1544,7 @@ mod tests { fn rgba64_to_rgba_row_rejects_short_output() { let src = [0u16; 4]; let mut out = [0u8; 3]; // needs 4 - rgba64_to_rgba_row(&src, &mut out, 1, false); + rgba64_to_rgba_row::(&src, &mut out, 1, false); } #[test] @@ -1473,7 +1553,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 3]; let mut luma: [u8; 0] = []; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1490,7 +1570,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 2]; // needs 3 let mut luma = [0u8; 1]; - rgb48_to_luma_row( + rgb48_to_luma_row::( &src, &mut luma, &mut scratch, @@ -1521,7 +1601,7 @@ mod tests { fn rgb48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgb48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + rgb48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1530,7 +1610,7 @@ mod tests { fn bgr48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgr48_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + bgr48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1539,7 +1619,7 
@@ mod tests { fn rgba64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgba64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + rgba64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } #[cfg(target_pointer_width = "32")] @@ -1548,6 +1628,6 @@ mod tests { fn bgra64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgra64_to_rgb_row(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + bgra64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } } diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 9b93a405..eedf8b2a 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -948,7 +948,12 @@ pub fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize, use_simd /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgb_row( + x2rgb10: &[u8], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); let rgb_min = rgb_row_bytes(width); assert!(x2rgb10.len() >= in_min, "x2rgb10 row too short"); @@ -958,34 +963,34 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_ cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::neon::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgb_row(x2rgb10, rgb_out, width); + scalar::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } /// Drops the 2-bit padding, down-shifts to 8 bits, and forces alpha @@ -993,7 +998,12 @@ pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_ /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgba_row( + x2rgb10: &[u8], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgba_min = rgba_row_bytes(width); assert!(x2rgb10.len() >= rgba_min, "x2rgb10 row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); @@ -1002,34 +1012,34 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::neon::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgba_row(x2rgb10, rgba_out, width); + scalar::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } /// Extracts each 10-bit channel into native-depth `u16` (low-bit @@ -1038,7 +1048,12 @@ pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, us /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn x2rgb10_to_rgb_u16_row( + x2rgb10: &[u8], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); // u16 RGB output is sized in `u16` *elements*, not bytes — match // the rest of the high-bit-depth dispatchers. @@ -1050,41 +1065,46 @@ pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::neon::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx512::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_avx2::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::x86_sse41::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2rgb10_to_rgb_u16_row(x2rgb10, rgb_out, width); + scalar::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_row`]. Channel /// positions in the source `u32` are reversed; output is still /// `R, G, B`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgb_row( + x2bgr10: &[u8], + rgb_out: &mut [u8], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); let rgb_min = rgb_row_bytes(width); assert!(x2bgr10.len() >= in_min, "x2bgr10 row too short"); @@ -1094,39 +1114,44 @@ pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_ cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::neon::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgb_row(x2bgr10, rgb_out, width); + scalar::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgba_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgba_row( + x2bgr10: &[u8], + rgba_out: &mut [u8], + width: usize, + use_simd: bool, +) { let rgba_min = rgba_row_bytes(width); assert!(x2bgr10.len() >= rgba_min, "x2bgr10 row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); @@ -1135,39 +1160,44 @@ pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, us cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::neon::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgba_row(x2bgr10, rgba_out, width); + scalar::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_u16_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { +pub fn x2bgr10_to_rgb_u16_row( + x2bgr10: &[u8], + rgb_out: &mut [u16], + width: usize, + use_simd: bool, +) { let in_min = rgba_row_bytes(width); // u16 RGB output is sized in `u16` *elements*, not bytes. let rgb_min = rgb_row_elems(width); @@ -1178,32 +1208,32 @@ pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { arch::neon::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::neon::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx512::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } if avx2_available() { - unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_avx2::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } if sse41_available() { - unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::x86_sse41::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); } + unsafe { arch::wasm_simd128::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } return; } }, _ => {} } } - scalar::x2bgr10_to_rgb_u16_row(x2bgr10, rgb_out, width); + scalar::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } diff --git a/src/row/scalar/packed_rgb.rs b/src/row/scalar/packed_rgb.rs index f1c2862d..0f4091f6 100644 --- a/src/row/scalar/packed_rgb.rs +++ b/src/row/scalar/packed_rgb.rs @@ -306,12 +306,17 @@ pub(crate) fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize) { /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or /// `rgb_out.len() < 3 * width`. 
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn x2rgb10_to_rgb_row<const BE: bool>(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = (pix >> 20) & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = pix & 0x3FF;
@@ -330,12 +335,21 @@ pub(crate) fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usiz
 /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn x2rgb10_to_rgba_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = (pix >> 20) & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = pix & 0x3FF;
@@ -355,12 +369,21 @@ pub(crate) fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: us
 /// Panics (any build profile) if `x2rgb10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn x2rgb10_to_rgb_u16_row<const BE: bool>(
+    x2rgb10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2rgb10.len() >= width * 4, "x2rgb10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]]);
+        let bytes = [x2rgb10[i], x2rgb10[i + 1], x2rgb10[i + 2], x2rgb10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let dst = x * 3;
         rgb_out[dst] = ((pix >> 20) & 0x3FF) as u16;
         rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16;
@@ -377,12 +400,17 @@ pub(crate) fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width:
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
+pub(crate) fn x2bgr10_to_rgb_row<const BE: bool>(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = pix & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = (pix >> 20) & 0x3FF;
@@ -400,12 +428,21 @@ pub(crate) fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usiz
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgba_out.len() < 4 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize) {
+pub(crate) fn x2bgr10_to_rgba_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgba_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let r10 = pix & 0x3FF;
         let g10 = (pix >> 10) & 0x3FF;
         let b10 = (pix >> 20) & 0x3FF;
@@ -423,12 +460,21 @@ pub(crate) fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: us
 /// Panics (any build profile) if `x2bgr10.len() < 4 * width` or
 /// `rgb_out.len() < 3 * width`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub(crate) fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize) {
+pub(crate) fn x2bgr10_to_rgb_u16_row<const BE: bool>(
+    x2bgr10: &[u8],
+    rgb_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(x2bgr10.len() >= width * 4, "x2bgr10 row too short");
     debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short");
     for x in 0..width {
         let i = x * 4;
-        let pix = u32::from_le_bytes([x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]]);
+        let bytes = [x2bgr10[i], x2bgr10[i + 1], x2bgr10[i + 2], x2bgr10[i + 3]];
+        let pix = if BE {
+            u32::from_be_bytes(bytes)
+        } else {
+            u32::from_le_bytes(bytes)
+        };
         let dst = x * 3;
         rgb_out[dst] = (pix & 0x3FF) as u16;
         rgb_out[dst + 1] = ((pix >> 10) & 0x3FF) as u16;
diff --git a/src/row/scalar/packed_rgb_16bit.rs b/src/row/scalar/packed_rgb_16bit.rs
index d530eaa2..e7883a79 100644
--- a/src/row/scalar/packed_rgb_16bit.rs
+++ b/src/row/scalar/packed_rgb_16bit.rs
@@ -1,8 +1,10 @@ //!
Scalar reference kernels for 16-bit packed RGB sources (Tier 8 finish).
 //!
-//! Input planes are `&[u16]`. Each u16 sample is the native channel value
-//! (range [0, 65535]). No endian conversion — caller deserialises LE bytes
-//! to `&[u16]` before constructing the frame.
+//! Input planes are `&[u16]`. Each u16 sample is either LE- or BE-encoded on
+//! disk/wire; the `<const BE: bool>` const-generic parameter selects the
+//! interpretation. When `BE = false` (the default) the kernels behave exactly
+//! as before — no extra work. When `BE = true` each u16 element is
+//! byte-swapped on load via `u16::swap_bytes()` before any channel math.
 //!
 //! # Format layouts
 //!
@@ -18,56 +20,91 @@
 //! - u16 → u8: `(v >> 8) as u8` (high-byte extraction, matching Y216 / Ship 11d).
 //! - u16 → u16: identity copy (no scaling).

+// ---- Endian load helper ------------------------------------------------------
+
+/// Load one u16 element, applying a byte-swap when `BE = true`.
+///
+/// The `if BE` branch is evaluated at compile time (monomorphization), so the
+/// unused branch is entirely eliminated from the generated binary.
+#[inline(always)]
+fn load_u16<const BE: bool>(v: u16) -> u16 {
+    if BE { v.swap_bytes() } else { v }
+}
+
 // ---- Rgb48 family (3 u16 elements per pixel: R, G, B) ----------------------

 /// Rgb48 → packed u8 RGB: narrow each 16-bit channel via `>> 8`.
 ///
+/// When `BE = true` each u16 element is byte-swapped on load so the channel
+/// value is in host-native order before narrowing.
+///
 /// Input stride: `width * 3` u16 elements, output: `width * 3` bytes.
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_out[dst] = (rgb48[src] >> 8) as u8; - rgb_out[dst + 1] = (rgb48[src + 1] >> 8) as u8; - rgb_out[dst + 2] = (rgb48[src + 2] >> 8) as u8; + rgb_out[dst] = (load_u16::(rgb48[src]) >> 8) as u8; + rgb_out[dst + 1] = (load_u16::(rgb48[src + 1]) >> 8) as u8; + rgb_out[dst + 2] = (load_u16::(rgb48[src + 2]) >> 8) as u8; } } -/// Rgb48 → packed u16 RGB: identity copy (already R, G, B order). +/// Rgb48 → packed u16 RGB: copy with optional byte-swap (already R, G, B order). +/// +/// When `BE = true` each element is byte-swapped so the output contains +/// host-native u16 values. /// /// Input and output stride: `width * 3` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgb48_to_rgb_u16_row( + rgb48: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); - rgb_u16_out[..width * 3].copy_from_slice(&rgb48[..width * 3]); + if BE { + for i in 0..width * 3 { + rgb_u16_out[i] = rgb48[i].swap_bytes(); + } + } else { + rgb_u16_out[..width * 3].copy_from_slice(&rgb48[..width * 3]); + } } /// Rgb48 → packed u8 RGBA: narrow each 16-bit channel via `>> 8`, force alpha = 0xFF. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input stride: `width * 3` u16 elements, output: `width * 4` bytes. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_out[dst] = (rgb48[src] >> 8) as u8; - rgba_out[dst + 1] = (rgb48[src + 1] >> 8) as u8; - rgba_out[dst + 2] = (rgb48[src + 2] >> 8) as u8; + rgba_out[dst] = (load_u16::(rgb48[src]) >> 8) as u8; + rgba_out[dst + 1] = (load_u16::(rgb48[src + 1]) >> 8) as u8; + rgba_out[dst + 2] = (load_u16::(rgb48[src + 2]) >> 8) as u8; rgba_out[dst + 3] = 0xFF; } } -/// Rgb48 → packed u16 RGBA: copy R/G/B as-is, force alpha = 0xFFFF. +/// Rgb48 → packed u16 RGBA: copy R/G/B (with optional byte-swap), force alpha = 0xFFFF. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input stride: `width * 3` u16 elements, output: `width * 4` u16 elements. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgb48_to_rgba_u16_row( + rgb48: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgb48.len() >= width * 3, "rgb48 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -76,9 +113,9 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_u16_out[dst] = rgb48[src]; - rgba_u16_out[dst + 1] = rgb48[src + 1]; - rgba_u16_out[dst + 2] = rgb48[src + 2]; + rgba_u16_out[dst] = load_u16::(rgb48[src]); + rgba_u16_out[dst + 1] = load_u16::(rgb48[src + 1]); + rgba_u16_out[dst + 2] = load_u16::(rgb48[src + 2]); rgba_u16_out[dst + 3] = 0xFFFF; } } @@ -87,54 +124,70 @@ pub(crate) fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_u16_out: &mut [u16], wid /// Bgr48 → packed u8 RGB: narrow via `>> 8`, swap B↔R on output. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Source layout `[B, G, R]` → output layout `[R, G, B]`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_out[dst] = (bgr48[src + 2] >> 8) as u8; // R (from B-G-R position 2) - rgb_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G (unchanged) - rgb_out[dst + 2] = (bgr48[src] >> 8) as u8; // B (from B-G-R position 0) + rgb_out[dst] = (load_u16::(bgr48[src + 2]) >> 8) as u8; // R (from B-G-R position 2) + rgb_out[dst + 1] = (load_u16::(bgr48[src + 1]) >> 8) as u8; // G (unchanged) + rgb_out[dst + 2] = (load_u16::(bgr48[src]) >> 8) as u8; // B (from B-G-R position 0) } } -/// Bgr48 → packed u16 RGB: copy with B↔R swap. +/// Bgr48 → packed u16 RGB: copy with B↔R swap (and optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R]` → output layout `[R, G, B]`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgr48_to_rgb_u16_row( + bgr48: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 3; - rgb_u16_out[dst] = bgr48[src + 2]; // R - rgb_u16_out[dst + 1] = bgr48[src + 1]; // G - rgb_u16_out[dst + 2] = bgr48[src]; // B + rgb_u16_out[dst] = load_u16::(bgr48[src + 2]); // R + rgb_u16_out[dst + 1] = load_u16::(bgr48[src + 1]); // G + rgb_u16_out[dst + 2] = load_u16::(bgr48[src]); // B } } /// Bgr48 → packed u8 RGBA: narrow + B↔R swap + force alpha = 0xFF. 
+/// +/// When `BE = true` each u16 element is byte-swapped on load. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_out[dst] = (bgr48[src + 2] >> 8) as u8; // R - rgba_out[dst + 1] = (bgr48[src + 1] >> 8) as u8; // G - rgba_out[dst + 2] = (bgr48[src] >> 8) as u8; // B + rgba_out[dst] = (load_u16::(bgr48[src + 2]) >> 8) as u8; // R + rgba_out[dst + 1] = (load_u16::(bgr48[src + 1]) >> 8) as u8; // G + rgba_out[dst + 2] = (load_u16::(bgr48[src]) >> 8) as u8; // B rgba_out[dst + 3] = 0xFF; } } -/// Bgr48 → packed u16 RGBA: B↔R swap + force alpha = 0xFFFF. +/// Bgr48 → packed u16 RGBA: B↔R swap (+ optional byte-swap) + force alpha = 0xFFFF. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgr48_to_rgba_u16_row( + bgr48: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgr48.len() >= width * 3, "bgr48 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -143,9 +196,9 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid for x in 0..width { let src = x * 3; let dst = x * 4; - rgba_u16_out[dst] = bgr48[src + 2]; // R - rgba_u16_out[dst + 1] = bgr48[src + 1]; // G - rgba_u16_out[dst + 2] = bgr48[src]; // B + rgba_u16_out[dst] = load_u16::(bgr48[src + 2]); // R + rgba_u16_out[dst + 1] = load_u16::(bgr48[src + 1]); // G + rgba_u16_out[dst + 2] = load_u16::(bgr48[src]); // B rgba_u16_out[dst + 3] = 0xFFFF; } } @@ -154,121 +207,167 @@ pub(crate) fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_u16_out: &mut [u16], wid /// Rgba64 → packed u8 RGB: drop alpha, narrow R/G/B via `>> 8`. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input stride: `width * 4` u16 elements, output: `width * 3` bytes. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_out[dst] = (rgba64[src] >> 8) as u8; - rgb_out[dst + 1] = (rgba64[src + 1] >> 8) as u8; - rgb_out[dst + 2] = (rgba64[src + 2] >> 8) as u8; + rgb_out[dst] = (load_u16::(rgba64[src]) >> 8) as u8; + rgb_out[dst + 1] = (load_u16::(rgba64[src + 1]) >> 8) as u8; + rgb_out[dst + 2] = (load_u16::(rgba64[src + 2]) >> 8) as u8; } } -/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B as-is. 
+/// Rgba64 → packed u16 RGB: drop alpha, copy R/G/B (with optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input stride: `width * 4` u16 elements, output: `width * 3` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgba64_to_rgb_u16_row( + rgba64: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_u16_out[dst] = rgba64[src]; - rgb_u16_out[dst + 1] = rgba64[src + 1]; - rgb_u16_out[dst + 2] = rgba64[src + 2]; + rgb_u16_out[dst] = load_u16::(rgba64[src]); + rgb_u16_out[dst + 1] = load_u16::(rgba64[src + 1]); + rgb_u16_out[dst + 2] = load_u16::(rgba64[src + 2]); } } /// Rgba64 → packed u8 RGBA: narrow all 4 channels via `>> 8` (source alpha passes through). /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Input and output stride: `width * 4` elements. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn rgba64_to_rgba_row( + rgba64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let i = x * 4; - rgba_out[i] = (rgba64[i] >> 8) as u8; - rgba_out[i + 1] = (rgba64[i + 1] >> 8) as u8; - rgba_out[i + 2] = (rgba64[i + 2] >> 8) as u8; - rgba_out[i + 3] = (rgba64[i + 3] >> 8) as u8; + rgba_out[i] = (load_u16::(rgba64[i]) >> 8) as u8; + rgba_out[i + 1] = (load_u16::(rgba64[i + 1]) >> 8) as u8; + rgba_out[i + 2] = (load_u16::(rgba64[i + 2]) >> 8) as u8; + rgba_out[i + 3] = (load_u16::(rgba64[i + 3]) >> 8) as u8; } } -/// Rgba64 → packed u16 RGBA: identity copy of all 4 channels. +/// Rgba64 → packed u16 RGBA: copy all 4 channels (with optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Input and output stride: `width * 4` u16 elements. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn rgba64_to_rgba_u16_row( + rgba64: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(rgba64.len() >= width * 4, "rgba64 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, "rgba_u16_out row too short" ); - rgba_u16_out[..width * 4].copy_from_slice(&rgba64[..width * 4]); + if BE { + for i in 0..width * 4 { + rgba_u16_out[i] = rgba64[i].swap_bytes(); + } + } else { + rgba_u16_out[..width * 4].copy_from_slice(&rgba64[..width * 4]); + } } // ---- Bgra64 family (4 u16 elements per pixel: B, G, R, A) ------------------ /// Bgra64 → packed u8 RGB: drop alpha, narrow via `>> 8`, swap B↔R on output. /// +/// When `BE = true` each u16 element is byte-swapped on load. 
+/// /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { +pub(crate) fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_out.len() >= width * 3, "rgb_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_out[dst] = (bgra64[src + 2] >> 8) as u8; // R (from position 2) - rgb_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G (unchanged) - rgb_out[dst + 2] = (bgra64[src] >> 8) as u8; // B (from position 0) + rgb_out[dst] = (load_u16::(bgra64[src + 2]) >> 8) as u8; // R (from position 2) + rgb_out[dst + 1] = (load_u16::(bgra64[src + 1]) >> 8) as u8; // G (unchanged) + rgb_out[dst + 2] = (load_u16::(bgra64[src]) >> 8) as u8; // B (from position 0) } } -/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap. +/// Bgra64 → packed u16 RGB: drop alpha, B↔R swap (+ optional byte-swap). +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R, A]` → output layout `[R, G, B]`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgra64_to_rgb_u16_row( + bgra64: &[u16], + rgb_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 3; - rgb_u16_out[dst] = bgra64[src + 2]; // R - rgb_u16_out[dst + 1] = bgra64[src + 1]; // G - rgb_u16_out[dst + 2] = bgra64[src]; // B + rgb_u16_out[dst] = load_u16::(bgra64[src + 2]); // R + rgb_u16_out[dst + 1] = load_u16::(bgra64[src + 1]); // G + rgb_u16_out[dst + 2] = load_u16::(bgra64[src]); // B } } /// Bgra64 → packed u8 RGBA: narrow via `>> 8`, swap B↔R, pass through source alpha. /// +/// When `BE = true` each u16 element is byte-swapped on load. +/// /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all narrowed `>> 8`). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize) { +pub(crate) fn bgra64_to_rgba_row( + bgra64: &[u16], + rgba_out: &mut [u8], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!(rgba_out.len() >= width * 4, "rgba_out row too short"); for x in 0..width { let src = x * 4; let dst = x * 4; - rgba_out[dst] = (bgra64[src + 2] >> 8) as u8; // R - rgba_out[dst + 1] = (bgra64[src + 1] >> 8) as u8; // G - rgba_out[dst + 2] = (bgra64[src] >> 8) as u8; // B - rgba_out[dst + 3] = (bgra64[src + 3] >> 8) as u8; // A + rgba_out[dst] = (load_u16::(bgra64[src + 2]) >> 8) as u8; // R + rgba_out[dst + 1] = (load_u16::(bgra64[src + 1]) >> 8) as u8; // G + rgba_out[dst + 2] = (load_u16::(bgra64[src]) >> 8) as u8; // B + rgba_out[dst + 3] = (load_u16::(bgra64[src + 3]) >> 8) as u8; // A } } -/// Bgra64 → packed u16 RGBA: B↔R swap, pass through source alpha unchanged. 
+/// Bgra64 → packed u16 RGBA: B↔R swap (+ optional byte-swap), pass through source alpha. +/// +/// When `BE = true` each element is byte-swapped to produce host-native output. /// /// Source layout `[B, G, R, A]` → output layout `[R, G, B, A]` (all native u16). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], width: usize) { +pub(crate) fn bgra64_to_rgba_u16_row( + bgra64: &[u16], + rgba_u16_out: &mut [u16], + width: usize, +) { debug_assert!(bgra64.len() >= width * 4, "bgra64 row too short"); debug_assert!( rgba_u16_out.len() >= width * 4, @@ -277,10 +376,10 @@ pub(crate) fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_u16_out: &mut [u16], w for x in 0..width { let src = x * 4; let dst = x * 4; - rgba_u16_out[dst] = bgra64[src + 2]; // R - rgba_u16_out[dst + 1] = bgra64[src + 1]; // G - rgba_u16_out[dst + 2] = bgra64[src]; // B - rgba_u16_out[dst + 3] = bgra64[src + 3]; // A (unchanged) + rgba_u16_out[dst] = load_u16::(bgra64[src + 2]); // R + rgba_u16_out[dst + 1] = load_u16::(bgra64[src + 1]); // G + rgba_u16_out[dst + 2] = load_u16::(bgra64[src]); // B + rgba_u16_out[dst + 3] = load_u16::(bgra64[src + 3]); // A (byte-order corrected) } } @@ -297,7 +396,7 @@ mod tests { fn rgb48_to_rgb_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 3 * 4]; let mut out = std::vec![0u16; 3 * 4]; - rgb48_to_rgb_u16_row(&src, &mut out, 4); + rgb48_to_rgb_u16_row::(&src, &mut out, 4); assert!( out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF, got {out:?}" @@ -309,7 +408,7 @@ mod tests { fn rgb48_to_rgb_all_white_narrow() { let src = std::vec![0xFFFFu16; 3 * 4]; let mut out = std::vec![0u8; 3 * 4]; - rgb48_to_rgb_row(&src, &mut out, 4); + rgb48_to_rgb_row::(&src, &mut out, 4); assert!( out.iter().all(|&v| v == 0xFF), "expected all 0xFF, got {out:?}" @@ -321,7 +420,7 @@ mod tests { fn rgb48_to_rgb_narrow_known_value() { let src = [0x1234u16, 0x5678, 0x9ABC]; let mut out = [0u8; 3]; - 
rgb48_to_rgb_row(&src, &mut out, 1); + rgb48_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x12, "R channel"); assert_eq!(out[1], 0x56, "G channel"); assert_eq!(out[2], 0x9A, "B channel"); @@ -332,7 +431,7 @@ mod tests { fn rgb48_to_rgba_forces_alpha_0xff() { let src = [0xAAAAu16, 0xBBBB, 0xCCCC]; let mut out = [0u8; 4]; - rgb48_to_rgba_row(&src, &mut out, 1); + rgb48_to_rgba_row::(&src, &mut out, 1); assert_eq!(out[3], 0xFF, "alpha must be 0xFF"); assert_eq!(out[0], 0xAA, "R"); assert_eq!(out[1], 0xBB, "G"); @@ -344,7 +443,7 @@ mod tests { fn rgb48_to_rgba_u16_forces_alpha_0xffff() { let src = [0xAAAAu16, 0xBBBB, 0xCCCC]; let mut out = [0u16; 4]; - rgb48_to_rgba_u16_row(&src, &mut out, 1); + rgb48_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0xAAAA, "R"); assert_eq!(out[1], 0xBBBB, "G"); assert_eq!(out[2], 0xCCCC, "B"); @@ -358,7 +457,7 @@ mod tests { fn bgr48_to_rgb_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 3 * 3]; let mut out = std::vec![0u16; 3 * 3]; - bgr48_to_rgb_u16_row(&src, &mut out, 3); + bgr48_to_rgb_u16_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -367,7 +466,7 @@ mod tests { fn bgr48_to_rgb_all_white_narrow() { let src = std::vec![0xFFFFu16; 3 * 3]; let mut out = std::vec![0u8; 3 * 3]; - bgr48_to_rgb_row(&src, &mut out, 3); + bgr48_to_rgb_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -378,7 +477,7 @@ mod tests { // Source pixel in BGR order: B=0x1234, G=0x5678, R=0x9ABC let src = [0x1234u16, 0x5678, 0x9ABC]; let mut out = [0u16; 3]; - bgr48_to_rgb_u16_row(&src, &mut out, 1); + bgr48_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x9ABC, "R (was at src[2])"); assert_eq!(out[1], 0x5678, "G (unchanged)"); assert_eq!(out[2], 0x1234, "B (was at src[0])"); @@ -389,7 +488,7 @@ mod tests { fn bgr48_to_rgb_channel_order_and_narrow() { let src = [0x1200u16, 0x5600, 0x9A00]; let mut out = [0u8; 3]; - 
bgr48_to_rgb_row(&src, &mut out, 1); + bgr48_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x9A, "R"); assert_eq!(out[1], 0x56, "G"); assert_eq!(out[2], 0x12, "B"); @@ -400,7 +499,7 @@ mod tests { fn bgr48_to_rgba_channel_order_and_alpha() { let src = [0x1100u16, 0x2200, 0x3300]; let mut out = [0u8; 4]; - bgr48_to_rgba_row(&src, &mut out, 1); + bgr48_to_rgba_row::(&src, &mut out, 1); assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -412,7 +511,7 @@ mod tests { fn bgr48_to_rgba_u16_channel_order_and_alpha() { let src = [0x1111u16, 0x2222, 0x3333]; let mut out = [0u16; 4]; - bgr48_to_rgba_u16_row(&src, &mut out, 1); + bgr48_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x1111, "B"); @@ -426,7 +525,7 @@ mod tests { fn rgba64_to_rgba_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 4 * 3]; let mut out = std::vec![0u16; 4 * 3]; - rgba64_to_rgba_u16_row(&src, &mut out, 3); + rgba64_to_rgba_u16_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -435,7 +534,7 @@ mod tests { fn rgba64_to_rgba_all_white_narrow() { let src = std::vec![0xFFFFu16; 4 * 3]; let mut out = std::vec![0u8; 4 * 3]; - rgba64_to_rgba_row(&src, &mut out, 3); + rgba64_to_rgba_row::(&src, &mut out, 3); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -445,7 +544,7 @@ mod tests { // R=0x1111, G=0x2222, B=0x3333, A=0xABCD let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; let mut out = [0u16; 4]; - rgba64_to_rgba_u16_row(&src, &mut out, 1); + rgba64_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x1111, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x3333, "B"); @@ -457,7 +556,7 @@ mod tests { fn rgba64_to_rgba_source_alpha_depth_converted() { let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut out = [0u8; 4]; - rgba64_to_rgba_row(&src, &mut out, 1); + rgba64_to_rgba_row::(&src, &mut 
out, 1); assert_eq!(out[0], 0x11, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x33, "B"); @@ -469,7 +568,7 @@ mod tests { fn rgba64_to_rgb_drops_alpha() { let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut out = [0u8; 3]; - rgba64_to_rgb_row(&src, &mut out, 1); + rgba64_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x11, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x33, "B"); @@ -480,7 +579,7 @@ mod tests { fn rgba64_to_rgb_u16_drops_alpha() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut out = [0u16; 3]; - rgba64_to_rgb_u16_row(&src, &mut out, 1); + rgba64_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x1111, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x3333, "B"); @@ -493,7 +592,7 @@ mod tests { fn bgra64_to_rgba_u16_all_white_passthrough() { let src = std::vec![0xFFFFu16; 4 * 2]; let mut out = std::vec![0u16; 4 * 2]; - bgra64_to_rgba_u16_row(&src, &mut out, 2); + bgra64_to_rgba_u16_row::(&src, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFFFF), "expected all 0xFFFF"); } @@ -502,7 +601,7 @@ mod tests { fn bgra64_to_rgba_all_white_narrow() { let src = std::vec![0xFFFFu16; 4 * 2]; let mut out = std::vec![0u8; 4 * 2]; - bgra64_to_rgba_row(&src, &mut out, 2); + bgra64_to_rgba_row::(&src, &mut out, 2); assert!(out.iter().all(|&v| v == 0xFF), "expected all 0xFF"); } @@ -512,7 +611,7 @@ mod tests { // Source in BGRA order: B=0x1111, G=0x2222, R=0x3333, A=0x4444 let src = [0x1111u16, 0x2222, 0x3333, 0x4444]; let mut out = [0u16; 4]; - bgra64_to_rgba_u16_row(&src, &mut out, 1); + bgra64_to_rgba_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R (from src[2])"); assert_eq!(out[1], 0x2222, "G (unchanged)"); assert_eq!(out[2], 0x1111, "B (from src[0])"); @@ -524,7 +623,7 @@ mod tests { fn bgra64_to_rgba_channel_order_and_alpha_narrowed() { let src = [0x1100u16, 0x2200, 0x3300, 0xAB00]; let mut out = [0u8; 4]; - bgra64_to_rgba_row(&src, &mut out, 1); + bgra64_to_rgba_row::(&src, &mut out, 1); 
assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -536,7 +635,7 @@ mod tests { fn bgra64_to_rgb_drops_alpha_and_swaps() { let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut out = [0u8; 3]; - bgra64_to_rgb_row(&src, &mut out, 1); + bgra64_to_rgb_row::(&src, &mut out, 1); assert_eq!(out[0], 0x33, "R"); assert_eq!(out[1], 0x22, "G"); assert_eq!(out[2], 0x11, "B"); @@ -547,7 +646,7 @@ mod tests { fn bgra64_to_rgb_u16_drops_alpha_and_swaps() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut out = [0u16; 3]; - bgra64_to_rgb_u16_row(&src, &mut out, 1); + bgra64_to_rgb_u16_row::(&src, &mut out, 1); assert_eq!(out[0], 0x3333, "R"); assert_eq!(out[1], 0x2222, "G"); assert_eq!(out[2], 0x1111, "B"); @@ -564,7 +663,7 @@ mod tests { 0x1100u16, 0x2200, 0x3300, 0x4400, 0x5500, 0x6600, 0x7700, 0x8800, 0x9900, ]; let mut out = [0u8; 9]; - rgb48_to_rgb_row(&src, &mut out, 3); + rgb48_to_rgb_row::(&src, &mut out, 3); assert_eq!(out[0], 0x11); assert_eq!(out[1], 0x22); assert_eq!(out[2], 0x33); @@ -584,7 +683,7 @@ mod tests { 0x5555, 0x6666, 0x7777, 0x8888, // pixel 1 ]; let mut out = [0u16; 8]; - rgba64_to_rgba_u16_row(&src, &mut out, 2); + rgba64_to_rgba_u16_row::(&src, &mut out, 2); assert_eq!(&out, &src, "identity copy must be byte-exact"); } @@ -598,8 +697,8 @@ mod tests { let mut rgb48_out = [0u8; 3]; let mut bgr48_out = [0u8; 3]; - rgb48_to_rgb_row(&rgb48_src, &mut rgb48_out, 1); - bgr48_to_rgb_row(&bgr48_src, &mut bgr48_out, 1); + rgb48_to_rgb_row::(&rgb48_src, &mut rgb48_out, 1); + bgr48_to_rgb_row::(&bgr48_src, &mut bgr48_out, 1); assert_eq!( rgb48_out, bgr48_out, diff --git a/src/sinker/mixed/packed_rgb_10bit.rs b/src/sinker/mixed/packed_rgb_10bit.rs index 4470663d..ea0a62f5 100644 --- a/src/sinker/mixed/packed_rgb_10bit.rs +++ b/src/sinker/mixed/packed_rgb_10bit.rs @@ -149,7 +149,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { w, h, )?; - x2rgb10_to_rgb_row(x2rgb10_in, rgb_row, w, use_simd); + 
x2rgb10_to_rgb_row::(x2rgb10_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -177,7 +177,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { // u8 RGBA output (single-pass, dedicated kernel forces alpha). if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2rgb10_to_rgba_row(x2rgb10_in, rgba_row, w, use_simd); + x2rgb10_to_rgba_row::(x2rgb10_in, rgba_row, w, use_simd); } // u16 native RGB output (10-bit precision preserved). @@ -193,7 +193,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2rgb10_to_rgb_u16_row(x2rgb10_in, rgb_u16_row, w, use_simd); + x2rgb10_to_rgb_u16_row::(x2rgb10_in, rgb_u16_row, w, use_simd); } Ok(()) @@ -307,7 +307,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { w, h, )?; - x2bgr10_to_rgb_row(x2bgr10_in, rgb_row, w, use_simd); + x2bgr10_to_rgb_row::(x2bgr10_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -334,7 +334,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2bgr10_to_rgba_row(x2bgr10_in, rgba_row, w, use_simd); + x2bgr10_to_rgba_row::(x2bgr10_in, rgba_row, w, use_simd); } if want_rgb_u16 { @@ -349,7 +349,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2bgr10_to_rgb_u16_row(x2bgr10_in, rgb_u16_row, w, use_simd); + x2bgr10_to_rgb_u16_row::(x2bgr10_in, rgb_u16_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index 49e34d84..e7ad27cd 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -206,7 +206,7 @@ impl 
PixelSink for MixedSinker<'_, Rgb48> { // with_luma_u16, or with_hsv is attached. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgb48_to_rgb_row(in48, rgb_row, w, use_simd); + rgb48_to_rgb_row::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -245,7 +245,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> { // u8 RGBA — single-pass kernel, alpha forced to 0xFF. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_row(in48, rgba_row, w, use_simd); + rgb48_to_rgba_row::(in48, rgba_row, w, use_simd); } // u16 RGB — native passthrough. @@ -257,13 +257,13 @@ impl PixelSink for MixedSinker<'_, Rgb48> { height: h, channels: 3, })?; - rgb48_to_rgb_u16_row(in48, &mut buf[ps * 3..end], w, use_simd); + rgb48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); } // u16 RGBA — native passthrough, alpha forced to 0xFFFF. if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd); + rgb48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -426,7 +426,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgr48_to_rgb_row(in48, rgb_row, w, use_simd); + bgr48_to_rgb_row::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -464,7 +464,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_row(in48, rgba_row, w, use_simd); + bgr48_to_rgba_row::(in48, rgba_row, w, use_simd); } if let Some(buf) = rgb_u16.as_deref_mut() { @@ -475,12 +475,12 @@ impl PixelSink for MixedSinker<'_, Bgr48> { height: h, channels: 3, })?; - bgr48_to_rgb_u16_row(in48, &mut buf[ps 
* 3..end], w, use_simd); + bgr48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); } if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_u16_row(in48, rgba_u16_row, w, use_simd); + bgr48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -667,7 +667,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row(in64, rgba_row, w, use_simd); + rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -675,7 +675,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); return Ok(()); } @@ -683,7 +683,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { // and Strategy A+ RGBA fan-out. 
if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgba64_to_rgb_row(in64, rgb_row, w, use_simd); + rgba64_to_rgb_row::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -739,7 +739,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row(in64, rgba_row, w, use_simd); + rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); } // ===== u16 path ===== @@ -754,7 +754,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - rgba64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd); + rgba64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached — derive from the // just-computed u16 RGB row (writes α=0xFFFF), then overwrite α @@ -778,7 +778,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); } Ok(()) @@ -950,7 +950,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -958,14 +958,14 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, 
use_simd); + bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); return Ok(()); } // u8 RGB staging path. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgra64_to_rgb_row(in64, rgb_row, w, use_simd); + bgra64_to_rgb_row::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -1017,7 +1017,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); } // u16 RGB path. @@ -1031,7 +1031,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - bgra64_to_rgb_u16_row(in64, rgb_u16_row, w, use_simd); + bgra64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached. if want_rgba_u16 { @@ -1052,7 +1052,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row(in64, rgba_u16_row, w, use_simd); + bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); } Ok(()) From e1a4fe2071004089bedcd595886d008dd9a28a44 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 01:02:12 +1200 Subject: [PATCH 2/6] fix(be-tier8): make scalar BE conversion target-endian aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Tier 8 scalar BE-load helpers used `if BE { x.swap_bytes() } else { x }`, which is unconditional w.r.t. host endianness — wrong on big-endian hosts. 
The companion SIMD `load_endian_u16x*` / `load_endian_u32x4` helpers are target-endian aware (`#[cfg(target_endian = ...)]`), so a host-byte-order mismatch between scalar and SIMD would corrupt s390x rows and break the "SIMD matches scalar" parity property the dispatch tests rely on. Replace the swap-on-BE pattern with the target-endian-aware primitives: - `if BE { v.swap_bytes() } else { v }` → `if BE { u16::from_be(v) } else { u16::from_le(v) }` - The fast-path `copy_from_slice` else-branches in `rgb48_to_rgb_u16_row` and `rgba64_to_rgba_u16_row` are likewise replaced with a per-element `u16::from_le` loop so the LE source path is also correct on BE hosts. `from_be`/`from_le` are no-ops when the source byte order matches the host and a `swap_bytes` otherwise, mirroring the SIMD `load_le_*` / `load_be_*` semantics and keeping the scalar reference correct on every target. Note: the X2Rgb10/X2Bgr10 (u32) scalar paths in `packed_rgb.rs` already use `u32::from_be_bytes` / `u32::from_le_bytes` on raw `&[u8]` input, which are target-endian aware by definition, so no fix is needed there. Test fixtures (`byte_swap_*` / `to_be_bytes` helpers in `tests/`) are intentionally left untouched — they synthesise BE-encoded byte buffers from LE inputs and are correct as-is. 
Verified: - `cargo test --target aarch64-apple-darwin --lib` (2159 tests pass) - `cargo build --target x86_64-apple-darwin --tests` (0 warnings) - `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` - `cargo build --no-default-features` - `cargo fmt --check` - `cargo clippy --all-targets --all-features -- -D warnings` Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/packed_rgb_16bit.rs | 35 ++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/src/row/scalar/packed_rgb_16bit.rs b/src/row/scalar/packed_rgb_16bit.rs index e7883a79..e568c19e 100644 --- a/src/row/scalar/packed_rgb_16bit.rs +++ b/src/row/scalar/packed_rgb_16bit.rs @@ -2,9 +2,12 @@ //! //! Input planes are `&[u16]`. Each u16 sample is either LE- or BE-encoded on //! disk/wire; the `` const-generic parameter selects the -//! interpretation. When `BE = false` (the default) the kernels behave exactly -//! as before — no extra work. When `BE = true` each u16 element is -//! byte-swapped on load via `u16::swap_bytes()` before any channel math. +//! interpretation. When `BE = false` the input is LE-encoded; when `BE = true` +//! the input is BE-encoded. In both cases each element is converted to +//! host-native byte order on load via `u16::from_le` / `u16::from_be`, which +//! are no-ops when the source byte order already matches the host. This +//! mirrors the SIMD `load_endian_u16x*` helpers and keeps the scalar reference +//! correct on big-endian hosts (s390x). //! //! # Format layouts //! @@ -22,13 +25,19 @@ // ---- Endian load helper ------------------------------------------------------ -/// Load one u16 element, applying a byte-swap when `BE = true`. +/// Load one u16 element from a source whose byte order is selected by `BE`, +/// returning the value in host-native byte order. 
+/// +/// `u16::from_be` / `u16::from_le` are target-endian aware: each is a no-op +/// when the source byte order matches the host, and a `swap_bytes` otherwise. +/// This matches the SIMD `load_endian_u16x*` helpers and keeps the scalar +/// reference correct on big-endian hosts (s390x). /// /// The `if BE` branch is evaluated at compile time (monomorphization), so the /// unused branch is entirely eliminated from the generated binary. #[inline(always)] fn load_u16(v: u16) -> u16 { - if BE { v.swap_bytes() } else { v } + if BE { u16::from_be(v) } else { u16::from_le(v) } } // ---- Rgb48 family (3 u16 elements per pixel: R, G, B) ---------------------- @@ -68,10 +77,14 @@ pub(crate) fn rgb48_to_rgb_u16_row( debug_assert!(rgb_u16_out.len() >= width * 3, "rgb_u16_out row too short"); if BE { for i in 0..width * 3 { - rgb_u16_out[i] = rgb48[i].swap_bytes(); + rgb_u16_out[i] = u16::from_be(rgb48[i]); } } else { - rgb_u16_out[..width * 3].copy_from_slice(&rgb48[..width * 3]); + // LE source: use the target-endian-aware load on each element so big-endian + // hosts also receive host-native u16 output. + for i in 0..width * 3 { + rgb_u16_out[i] = u16::from_le(rgb48[i]); + } } } @@ -285,10 +298,14 @@ pub(crate) fn rgba64_to_rgba_u16_row( ); if BE { for i in 0..width * 4 { - rgba_u16_out[i] = rgba64[i].swap_bytes(); + rgba_u16_out[i] = u16::from_be(rgba64[i]); } } else { - rgba_u16_out[..width * 4].copy_from_slice(&rgba64[..width * 4]); + // LE source: use the target-endian-aware load on each element so big-endian + // hosts also receive host-native u16 output. 
+ for i in 0..width * 4 { + rgba_u16_out[i] = u16::from_le(rgba64[i]); + } } } From b4e0ac94938b09c0216253e7e30acc64bc6f57c7 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 00:42:21 +1200 Subject: [PATCH 3/6] fix(be-tier8): NEON/x86/wasm 16-bit BE != HOST_NATIVE_BE gate + wasm test :: + SIMD-width BE parity tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review on PR #87 found two findings already fixed in earlier tier PRs but not propagated to Tier 8: Finding 1 (high): per-backend `byteswap_*_if_be` helpers in `packed_rgb_16bit.rs` (NEON, SSE4.1, AVX2, AVX-512, wasm-simd128) gated the swap on `if BE { ... }` instead of `if BE != HOST_NATIVE_BE { ... }`. This corrupted output on two of the four `wire × host` quadrants (BE host with LE wire, and BE host with BE wire). Mirror PR #86's b7fb9d3 fix: - NEON: replace the local `byteswap_u16x8::` with the canonical `super::bswap_u16x8_if_be::` (already correct after PR #86). - SSE4.1 / AVX2 / AVX-512 / wasm-simd128: introduce `const HOST_NATIVE_BE: bool = cfg!(target_endian = "big")` and switch the local helpers to the `BE != HOST_NATIVE_BE` gate (consistent with the existing `gray.rs` pattern in the same crates). Finding 2 (medium): wasm-simd128 lane-order regression tests at `tests/packed_rgb_16bit.rs:302-366` were missing the `` turbofish on 8 call sites (4 wasm kernel + 4 scalar) added by the BE support PR. The wasm32 build broke as a result. Added `::` per the LE-encoded contract. Also adds 12 SIMD-width BE-vs-LE parity tests per Codex recommendation, covering Rgb48 / Bgr48 / Rgba64 / Bgra64 / X2Rgb10 / X2Bgr10 across all five backends. Buffers are constructed via `to_le_bytes` / `to_be_bytes` so semantics are host-independent (mirrors PR #86's 6924907). Widths chosen to exceed each backend's SIMD body threshold: - NEON (8 px / iter): width 17 (Rgb*/Bgr*/Rgba*/Bgra*), width 33 (X2RGB/BGR10). 
- SSE4.1 (8/16 px / iter): width 17 (16-bit), 33 (X2). - AVX2 (16/32 px / iter): width 33 (16-bit), 65 (X2). - AVX-512 (32 px / iter): width 65 (16-bit + X2). - wasm-simd128 (8/16 px): width 17 (16-bit), 33 (X2). X2RGB10 / X2BGR10 BE parity tests are co-located in `tests/packed_rgb_16bit.rs` rather than `tests/packed_rgb.rs` because the latter is not declared in `tests/mod.rs` for x86 / wasm backends (pre-existing dead-code condition unrelated to this fix), so adding them there would leave them uncompiled. Verified locally: - cargo test --target aarch64-apple-darwin --lib (2300 pass; +6 NEON 16-bit BE parity, NEON X2 BE parity already in tests/packed_rgb.rs module). - cargo test --target x86_64-apple-darwin --lib (3067 pass; +27 new x86 BE parity, SSE4.1 path actually executed under Rosetta). - cargo build --target x86_64-apple-darwin --tests (0 warnings). - RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests (clean — finding 2 fix). - cargo build --no-default-features (clean). - cargo fmt --check (clean). - cargo clippy --all-targets --all-features -- -D warnings (clean). - cargo check --target s390x-unknown-linux-gnu --lib (BE-host smoke). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon/packed_rgb_16bit.rs | 138 ++++---- src/row/arch/neon/tests/packed_rgb.rs | 87 +++++ src/row/arch/neon/tests/packed_rgb_16bit.rs | 224 +++++++++++++ src/row/arch/wasm_simd128/packed_rgb_16bit.rs | 24 +- .../wasm_simd128/tests/packed_rgb_16bit.rs | 311 ++++++++++++++++- src/row/arch/x86_avx2/packed_rgb_16bit.rs | 33 +- .../arch/x86_avx2/tests/packed_rgb_16bit.rs | 311 +++++++++++++++++ src/row/arch/x86_avx512/packed_rgb_16bit.rs | 33 +- .../arch/x86_avx512/tests/packed_rgb_16bit.rs | 310 +++++++++++++++++ src/row/arch/x86_sse41/packed_rgb_16bit.rs | 26 +- .../arch/x86_sse41/tests/packed_rgb_16bit.rs | 312 ++++++++++++++++++ 11 files changed, 1707 insertions(+), 102 deletions(-) diff --git a/src/row/arch/neon/packed_rgb_16bit.rs b/src/row/arch/neon/packed_rgb_16bit.rs index b2a35d01..caf0c5f7 100644 --- a/src/row/arch/neon/packed_rgb_16bit.rs +++ b/src/row/arch/neon/packed_rgb_16bit.rs @@ -23,11 +23,15 @@ //! //! ## Big-endian support //! -//! Every public kernel accepts ``. When `BE = true`, each -//! per-channel `uint16x8_t` vector produced by `vld3q_u16`/`vld4q_u16` is -//! byte-swapped via `byteswap_u16x8::` before any channel math. On LE -//! targets (all current AArch64 hardware) the helper is a no-op and emits -//! zero extra instructions. +//! Every public kernel accepts ``. Each per-channel +//! `uint16x8_t` vector produced by `vld3q_u16`/`vld4q_u16` is conditionally +//! byte-swapped via the canonical [`super::bswap_u16x8_if_be`] helper before +//! any channel math. The gate is `BE != HOST_NATIVE_BE`, so the swap fires +//! only when the wire endian differs from the host's native byte order — on +//! LE hosts (all current AArch64 hardware) reading LE data the helper is a +//! no-op and emits zero extra instructions; on BE hosts (e.g. `aarch64_be`) +//! reading LE data the swap fires to recover host-native u16 lanes for the +//! arithmetic that follows. //! //! ## Depth conversion //! 
@@ -41,27 +45,9 @@ use core::arch::aarch64::*; +use super::bswap_u16x8_if_be; use crate::row::scalar; -// ---- endian byte-swap helper ------------------------------------------------ - -/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. -/// -/// Implemented as `vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v)))`, -/// the same transform used inside `load_be_u16x8` in the NEON endian module. -/// -/// # Safety -/// -/// Caller must have NEON enabled. -#[inline(always)] -unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { - if BE { - unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } - } else { - v - } -} - // ============================================================================= // Rgb48 (R, G, B — 3 u16 elements per pixel) // ============================================================================= @@ -92,9 +78,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } @@ -128,9 +114,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(rgb48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, 
g8, b8, alpha), @@ -170,9 +156,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgb_u16_row( vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), uint16x8x3_t( - byteswap_u16x8::(px.0), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.2), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.2), ), ); x += 8; @@ -210,9 +196,9 @@ pub(crate) unsafe fn neon_rgb48_to_rgba_u16_row( vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), uint16x8x4_t( - byteswap_u16x8::(px.0), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.2), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.2), alpha, ), ); @@ -254,9 +240,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgb_row( while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R (source BGR order) let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (was at position 2) - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (was at position 0) + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); // R (was at position 2) + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); // B (was at position 0) vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; } @@ -290,9 +276,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x3_t = vld3q_u16(bgr48.as_ptr().add(x * 3)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, alpha), @@ -332,9 +318,9 @@ pub(crate) unsafe fn 
neon_bgr48_to_rgb_u16_row( vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), uint16x8x3_t( - byteswap_u16x8::(px.2), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.0), + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.0), ), ); x += 8; @@ -373,9 +359,9 @@ pub(crate) unsafe fn neon_bgr48_to_rgba_u16_row( vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), uint16x8x4_t( - byteswap_u16x8::(px.2), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.0), + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.0), alpha, ), ); @@ -416,9 +402,9 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); // Alpha (px.3) discarded. 
vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; @@ -453,10 +439,10 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(rgba64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); - let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); + let a8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -497,9 +483,9 @@ pub(crate) unsafe fn neon_rgba64_to_rgb_u16_row( vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), uint16x8x3_t( - byteswap_u16x8::(px.0), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.2), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.2), ), ); x += 8; @@ -537,10 +523,10 @@ pub(crate) unsafe fn neon_rgba64_to_rgba_u16_row( vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), uint16x8x4_t( - byteswap_u16x8::(px.0), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.2), - byteswap_u16x8::(px.3), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.3), ), ); x += 8; @@ -581,9 +567,9 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_row( while x + 8 <= width { // px.0 = B, px.1 = G, px.2 = R, px.3 = A let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); // R (from position 2) - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); // G (unchanged) - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); // B (from position 0) + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); // R (from position 2) + let g8 = 
vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); // G (unchanged) + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); // B (from position 0) // Alpha (px.3) discarded. vst3_u8(rgb_out.as_mut_ptr().add(x * 3), uint8x8x3_t(r8, g8, b8)); x += 8; @@ -617,10 +603,10 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_row( let mut x = 0usize; while x + 8 <= width { let px: uint16x8x4_t = vld4q_u16(bgra64.as_ptr().add(x * 4)); - let r8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.2)); - let g8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.1)); - let b8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.0)); - let a8 = vshrn_n_u16::<8>(byteswap_u16x8::(px.3)); // source alpha depth-converted + let r8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.2)); + let g8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.1)); + let b8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.0)); + let a8 = vshrn_n_u16::<8>(bswap_u16x8_if_be::(px.3)); // source alpha depth-converted vst4_u8( rgba_out.as_mut_ptr().add(x * 4), uint8x8x4_t(r8, g8, b8, a8), @@ -660,9 +646,9 @@ pub(crate) unsafe fn neon_bgra64_to_rgb_u16_row( vst3q_u16( rgb_out.as_mut_ptr().add(x * 3), uint16x8x3_t( - byteswap_u16x8::(px.2), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.0), + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.0), ), ); x += 8; @@ -702,10 +688,10 @@ pub(crate) unsafe fn neon_bgra64_to_rgba_u16_row( vst4q_u16( rgba_out.as_mut_ptr().add(x * 4), uint16x8x4_t( - byteswap_u16x8::(px.2), - byteswap_u16x8::(px.1), - byteswap_u16x8::(px.0), - byteswap_u16x8::(px.3), + bswap_u16x8_if_be::(px.2), + bswap_u16x8_if_be::(px.1), + bswap_u16x8_if_be::(px.0), + bswap_u16x8_if_be::(px.3), ), ); x += 8; diff --git a/src/row/arch/neon/tests/packed_rgb.rs b/src/row/arch/neon/tests/packed_rgb.rs index 8f597259..39e15ea2 100644 --- a/src/row/arch/neon/tests/packed_rgb.rs +++ b/src/row/arch/neon/tests/packed_rgb.rs @@ -343,3 +343,90 @@ fn x2bgr10_to_rgb_u16_neon_matches_scalar_widths() { assert_eq!(out_scalar, out_neon, "width {w}"); } } + 
+// ---- SIMD-level BE-vs-LE parity for X2RGB10 / X2BGR10 ------------------- +// +// The X2 SIMD bodies are LE-only (`if !BE` gate falls through to scalar for +// BE), but the parity test is still meaningful: `` exercises the SIMD +// body on LE bytes; `` exercises the scalar reference on BE bytes +// (which is where the host-independence of the byte-buffer construction +// matters). Both must produce identical output. Width 33 ensures the SIMD +// body executes (NEON does 16 px / iter). + +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_x2rgb10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 33); + x2rgb10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, 
"x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_x2bgr10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 33); + x2bgr10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/neon/tests/packed_rgb_16bit.rs b/src/row/arch/neon/tests/packed_rgb_16bit.rs index c71131d7..bee374ce 100644 --- a/src/row/arch/neon/tests/packed_rgb_16bit.rs +++ b/src/row/arch/neon/tests/packed_rgb_16bit.rs @@ -321,3 +321,227 @@ fn neon_bgra64_to_rgba_u16_width1_scalar_tail_only() { "bgra64→rgba_u16 width=1: tail-only mismatch" ); } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// These probe the `bswap_u16x8_if_be` gate (`BE != HOST_NATIVE_BE`) at +// the SIMD layer. Existing tests above use `BE=false` only and never exercise +// the swap path. The fix in this commit replaces the broken +// `if BE { ... 
}` gate (which corrupted output on the BE host × LE wire and +// BE host × BE wire quadrants) with the canonical helper from `super::`. +// +// Buffers are constructed via `to_le_bytes` / `to_be_bytes` so semantics are +// host-independent: on every host, `le_buf` carries the intended values as +// LE-encoded bytes and `be_buf` carries the same values as BE-encoded bytes. +// Both `kernel(le_buf)` and `kernel(be_buf)` should +// therefore decode to the same intended host-native u16 values and produce +// identical RGB output. Mirrors PR #86's `87d682f` / `6924907` patterns. +// +// Width 17 = 2 × 8-lane SIMD body + 1 scalar tail, ensuring the SIMD body +// is exercised (not just the scalar tail). + +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_rgb48_be_le_simd_parity_width17() { + let intended = make_rgb48_src(17, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_rgb48_to_rgb_row::(&le, &mut out_le, 17); + neon_rgb48_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_rgb48_to_rgba_row::(&le, &mut out_le, 17); + neon_rgb48_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity 
(endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_rgb48_to_rgb_u16_row::(&le, &mut out_le, 17); + neon_rgb48_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_rgb48_to_rgba_u16_row::(&le, &mut out_le, 17); + neon_rgb48_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_bgr48_be_le_simd_parity_width17() { + let intended = make_rgb48_src(17, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_bgr48_to_rgb_row::(&le, &mut out_le, 17); + neon_bgr48_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_bgr48_to_rgba_row::(&le, &mut out_le, 17); + neon_bgr48_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_bgr48_to_rgb_u16_row::(&le, &mut out_le, 17); + neon_bgr48_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_bgr48_to_rgba_u16_row::(&le, &mut out_le, 17); + neon_bgr48_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD 
BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_rgba64_be_le_simd_parity_width17() { + let intended = make_rgba64_src(17, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_rgba64_to_rgb_row::(&le, &mut out_le, 17); + neon_rgba64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_rgba64_to_rgba_row::(&le, &mut out_le, 17); + neon_rgba64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_rgba64_to_rgb_u16_row::(&le, &mut out_le, 17); + neon_rgba64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_rgba64_to_rgba_u16_row::(&le, &mut out_le, 17); + neon_rgba64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_arch = "aarch64")] +#[cfg_attr(miri, ignore = "NEON intrinsics not supported under Miri")] +#[test] +fn neon_bgra64_be_le_simd_parity_width17() { + let intended = make_rgba64_src(17, 0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + neon_bgra64_to_rgb_row::(&le, &mut out_le, 17); + neon_bgra64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian 
gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + neon_bgra64_to_rgba_row::(&le, &mut out_le, 17); + neon_bgra64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + neon_bgra64_to_rgb_u16_row::(&le, &mut out_le, 17); + neon_bgra64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + neon_bgra64_to_rgba_u16_row::(&le, &mut out_le, 17); + neon_bgra64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} diff --git a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs index 4fa3fed5..5c4c24ff 100644 --- a/src/row/arch/wasm_simd128/packed_rgb_16bit.rs +++ b/src/row/arch/wasm_simd128/packed_rgb_16bit.rs @@ -219,12 +219,30 @@ unsafe fn narrow_u16x8_to_u8x8(v: v128) -> v128 { // ---- endian byte-swap helper ------------------------------------------------- -/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// Compile-time host endianness. `true` on BE targets, `false` on LE. /// -/// Uses `u8x16_swizzle` with a compile-time mask. +/// Used by [`byteswap_if_be`] to gate the swap on `BE != HOST_NATIVE_BE`, +/// covering all four `wire × host` quadrants. Mirrors the gate established +/// in the canonical NEON `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in `v` so the returned value is in +/// **host-native** byte order regardless of the host endianness. 
+///
+/// The gate is `BE != HOST_NATIVE_BE`:
+///
+/// | wire `BE` | host | gate    | action            |
+/// |-----------|------|---------|-------------------|
+/// | `false`   | LE   | `false` | no swap (LE→LE)   |
+/// | `false`   | BE   | `true`  | swap (LE→BE)      |
+/// | `true`    | LE   | `true`  | swap (BE→LE)      |
+/// | `true`    | BE   | `false` | no swap (BE→BE)   |
+///
+/// Uses `u8x16_swizzle` with a compile-time mask. The unused branch folds
+/// at compile time since both `BE` and `HOST_NATIVE_BE` are constants.
 #[inline(always)]
 unsafe fn byteswap_if_be<const BE: bool>(v: v128) -> v128 {
-    if BE {
+    if BE != HOST_NATIVE_BE {
         // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14]
         u8x16_swizzle(
             v,
diff --git a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs
index 40b1e770..183e036d 100644
--- a/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs
+++ b/src/row/arch/wasm_simd128/tests/packed_rgb_16bit.rs
@@ -299,8 +299,8 @@ fn wasm_rgba64_to_rgba_u16_lane_order_regression() {
     let src = make_rgba64_lane_order(9);
     let mut simd_out = std::vec![0u16; 9 * 4];
     let mut scalar_out = std::vec![0u16; 9 * 4];
-    unsafe { wasm_rgba64_to_rgba_u16_row(&src, &mut simd_out, 9) };
-    scalar::rgba64_to_rgba_u16_row(&src, &mut scalar_out, 9);
+    unsafe { wasm_rgba64_to_rgba_u16_row::<false>(&src, &mut simd_out, 9) };
+    scalar::rgba64_to_rgba_u16_row::<false>(&src, &mut scalar_out, 9);
     assert_eq!(
         simd_out, scalar_out,
         "rgba64→rgba_u16 lane order: SIMD vs scalar mismatch (channel mixing?)"
@@ -321,8 +321,8 @@ fn wasm_rgba64_to_rgb_u16_lane_order_regression() {
     let src = make_rgba64_lane_order(9);
     let mut simd_out = std::vec![0u16; 9 * 3];
     let mut scalar_out = std::vec![0u16; 9 * 3];
-    unsafe { wasm_rgba64_to_rgb_u16_row(&src, &mut simd_out, 9) };
-    scalar::rgba64_to_rgb_u16_row(&src, &mut scalar_out, 9);
+    unsafe { wasm_rgba64_to_rgb_u16_row::<false>(&src, &mut simd_out, 9) };
+    scalar::rgba64_to_rgb_u16_row::<false>(&src, &mut scalar_out, 9);
     assert_eq!(
         simd_out,
scalar_out, "rgba64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -341,8 +341,8 @@ fn wasm_bgra64_to_rgba_u16_lane_order_regression() { let src = make_bgra64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 4]; let mut scalar_out = std::vec![0u16; 9 * 4]; - unsafe { wasm_bgra64_to_rgba_u16_row(&src, &mut simd_out, 9) }; - scalar::bgra64_to_rgba_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_bgra64_to_rgba_u16_row::(&src, &mut simd_out, 9) }; + scalar::bgra64_to_rgba_u16_row::(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "bgra64→rgba_u16 lane order: SIMD vs scalar mismatch (B↔R swap or alpha?)" @@ -362,8 +362,8 @@ fn wasm_bgra64_to_rgb_u16_lane_order_regression() { let src = make_bgra64_lane_order(9); let mut simd_out = std::vec![0u16; 9 * 3]; let mut scalar_out = std::vec![0u16; 9 * 3]; - unsafe { wasm_bgra64_to_rgb_u16_row(&src, &mut simd_out, 9) }; - scalar::bgra64_to_rgb_u16_row(&src, &mut scalar_out, 9); + unsafe { wasm_bgra64_to_rgb_u16_row::(&src, &mut simd_out, 9) }; + scalar::bgra64_to_rgb_u16_row::(&src, &mut scalar_out, 9); assert_eq!( simd_out, scalar_out, "bgra64→rgb_u16 lane order: SIMD vs scalar mismatch" @@ -374,3 +374,298 @@ fn wasm_bgra64_to_rgb_u16_lane_order_regression() { assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 17 = 2 × 8-lane wasm-simd128 SIMD body + 1 scalar tail. 
+
+#[cfg(target_feature = "simd128")]
+fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+#[cfg(target_feature = "simd128")]
+#[test]
+fn wasm_rgb48_be_le_simd_parity_width17() {
+    let intended = pseudo_random_u16(17 * 3, 0xACE1_DEAD_BEEF_0001);
+    let (le, be) = make_le_be_pair_u16(&intended);
+
+    let mut out_le = std::vec![0u8; 17 * 3];
+    let mut out_be = std::vec![0u8; 17 * 3];
+    unsafe {
+        wasm_rgb48_to_rgb_row::<false>(&le, &mut out_le, 17);
+        wasm_rgb48_to_rgb_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u8; 17 * 4];
+    let mut out_be = std::vec![0u8; 17 * 4];
+    unsafe {
+        wasm_rgb48_to_rgba_row::<false>(&le, &mut out_le, 17);
+        wasm_rgb48_to_rgba_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)");
+
+    let mut out_le = std::vec![0u16; 17 * 3];
+    let mut out_be = std::vec![0u16; 17 * 3];
+    unsafe {
+        wasm_rgb48_to_rgb_u16_row::<false>(&le, &mut out_le, 17);
+        wasm_rgb48_to_rgb_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)"
+    );
+
+    let mut out_le = std::vec![0u16; 17 * 4];
+    let mut out_be = std::vec![0u16; 17 * 4];
+    unsafe {
+        wasm_rgb48_to_rgba_u16_row::<false>(&le, &mut out_le, 17);
+        wasm_rgb48_to_rgba_u16_row::<true>(&be, &mut out_be, 17);
+    }
+    assert_eq!(
+        out_le, out_be,
+        "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)"
+    );
+}
+
+#[cfg(target_feature = "simd128")]
+#[test]
+fn wasm_bgr48_be_le_simd_parity_width17() {
+    let
intended = pseudo_random_u16(17 * 3, 0xBEEF_C0DE_FACE_0002); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_bgr48_to_rgb_row::(&le, &mut out_le, 17); + wasm_bgr48_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_bgr48_to_rgba_row::(&le, &mut out_le, 17); + wasm_bgr48_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_bgr48_to_rgb_u16_row::(&le, &mut out_le, 17); + wasm_bgr48_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_bgr48_to_rgba_u16_row::(&le, &mut out_le, 17); + wasm_bgr48_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_rgba64_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 4, 0xCAFE_F00D_BABE_0003); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_rgba64_to_rgb_row::(&le, &mut out_le, 17); + wasm_rgba64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_rgba64_to_rgba_row::(&le, &mut out_le, 17); + wasm_rgba64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + 
"rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_rgba64_to_rgb_u16_row::(&le, &mut out_le, 17); + wasm_rgba64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_rgba64_to_rgba_u16_row::(&le, &mut out_le, 17); + wasm_rgba64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_bgra64_be_le_simd_parity_width17() { + let intended = pseudo_random_u16(17 * 4, 0xFEED_BEEF_FACE_0004); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + wasm_bgra64_to_rgb_row::(&le, &mut out_le, 17); + wasm_bgra64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + wasm_bgra64_to_rgba_row::(&le, &mut out_le, 17); + wasm_bgra64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + wasm_bgra64_to_rgb_u16_row::(&le, &mut out_le, 17); + wasm_bgra64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + wasm_bgra64_to_rgba_u16_row::(&le, &mut out_le, 17); + wasm_bgra64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + 
"bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in `tests/packed_rgb.rs` which is not +// declared in `tests/mod.rs`) so they are actually compiled and run. + +#[cfg(target_feature = "simd128")] +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +#[cfg(target_feature = "simd128")] +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_x2rgb10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 33); + x2rgb10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); 
+} + +#[cfg(target_feature = "simd128")] +#[test] +fn wasm_x2bgr10_be_le_simd_parity_width33() { + let intended = pseudo_random_x2_intended(33, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 33); + x2bgr10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/x86_avx2/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/packed_rgb_16bit.rs index db9343e2..0bfa9703 100644 --- a/src/row/arch/x86_avx2/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/packed_rgb_16bit.rs @@ -299,12 +299,22 @@ unsafe fn narrow_u16x16_to_u8x16(v: __m256i, zero: __m256i) -> __m128i { // ---- endian byte-swap helpers ----------------------------------------------- -/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// Compile-time host endianness. `true` on BE targets, `false` on LE. /// -/// Uses `_mm_shuffle_epi8` (SSSE3 subset of AVX2). +/// Used by the byte-swap helpers below to gate the swap on +/// `BE != HOST_NATIVE_BE`, covering all four `wire × host` quadrants. Mirrors +/// the gate established in `gray.rs` and the canonical NEON +/// `bswap_u16x8_if_be` helper. 
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");
+
+/// Conditionally byte-swap every u16 lane in a `__m128i` so the returned
+/// value is in **host-native** byte order regardless of the host endianness.
+///
+/// The gate is `BE != HOST_NATIVE_BE` — see [`byteswap256_if_be`] for the
+/// full truth table. Uses `_mm_shuffle_epi8` (SSSE3 subset of AVX2).
 #[inline(always)]
 unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i {
-    if BE {
+    if BE != HOST_NATIVE_BE {
         const MASK: __m128i = unsafe {
             core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14])
         };
         unsafe { _mm_shuffle_epi8(v, MASK) }
@@ -313,12 +323,23 @@ unsafe fn byteswap128_if_be<const BE: bool>(v: __m128i) -> __m128i {
     }
 }
 
-/// Byte-swap every u16 lane in a `__m256i` when `BE = true`; no-op otherwise.
+/// Conditionally byte-swap every u16 lane in a `__m256i` so the returned
+/// value is in **host-native** byte order regardless of the host endianness.
+///
+/// The gate is `BE != HOST_NATIVE_BE`:
+///
+/// | wire `BE` | host | gate    | action            |
+/// |-----------|------|---------|-------------------|
+/// | `false`   | LE   | `false` | no swap (LE→LE)   |
+/// | `false`   | BE   | `true`  | swap (LE→BE)      |
+/// | `true`    | LE   | `true`  | swap (BE→LE)      |
+/// | `true`    | BE   | `false` | no swap (BE→BE)   |
 ///
-/// Uses `_mm256_shuffle_epi8` (AVX2).
+/// Uses `_mm256_shuffle_epi8` (AVX2). The unused branch folds at compile
+/// time since both `BE` and `HOST_NATIVE_BE` are constants.
 #[inline(always)]
 unsafe fn byteswap256_if_be<const BE: bool>(v: __m256i) -> __m256i {
-    if BE {
+    if BE != HOST_NATIVE_BE {
         // Same u16-lane byte-swap mask, broadcast to both 128-bit lanes.
const MASK: __m256i = unsafe { core::mem::transmute([ diff --git a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs index 9dceec81..1905f3c2 100644 --- a/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx2/tests/packed_rgb_16bit.rs @@ -781,3 +781,314 @@ fn avx2_bgra64_to_rgb_u16_lane_order_handcheck() { assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 33 = 2 × 16-lane AVX2 SIMD body + 1 scalar tail. + +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_rgb48_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgb48_src(33, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_rgb48_to_rgb_row::(&le, &mut out_le, 33); + avx2_rgb48_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + 
avx2_rgb48_to_rgba_row::(&le, &mut out_le, 33); + avx2_rgb48_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_rgb48_to_rgb_u16_row::(&le, &mut out_le, 33); + avx2_rgb48_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_rgb48_to_rgba_u16_row::(&le, &mut out_le, 33); + avx2_rgb48_to_rgba_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_bgr48_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgb48_src(33, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_bgr48_to_rgb_row::(&le, &mut out_le, 33); + avx2_bgr48_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_bgr48_to_rgba_row::(&le, &mut out_le, 33); + avx2_bgr48_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_bgr48_to_rgb_u16_row::(&le, &mut out_le, 33); + avx2_bgr48_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = 
std::vec![0u16; 33 * 4]; + unsafe { + avx2_bgr48_to_rgba_u16_row::(&le, &mut out_le, 33); + avx2_bgr48_to_rgba_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_rgba64_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgba64_src(33, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_rgba64_to_rgb_row::(&le, &mut out_le, 33); + avx2_rgba64_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_rgba64_to_rgba_row::(&le, &mut out_le, 33); + avx2_rgba64_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_rgba64_to_rgb_u16_row::(&le, &mut out_le, 33); + avx2_rgba64_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_rgba64_to_rgba_u16_row::(&le, &mut out_le, 33); + avx2_rgba64_to_rgba_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_bgra64_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = make_rgba64_src(33, 0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + 
let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + avx2_bgra64_to_rgb_row::(&le, &mut out_le, 33); + avx2_bgra64_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + avx2_bgra64_to_rgba_row::(&le, &mut out_le, 33); + avx2_bgra64_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + avx2_bgra64_to_rgb_u16_row::(&le, &mut out_le, 33); + avx2_bgra64_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 33 * 4]; + let mut out_be = std::vec![0u16; 33 * 4]; + unsafe { + avx2_bgra64_to_rgba_u16_row::(&le, &mut out_le, 33); + avx2_bgra64_to_rgba_u16_row::(&be, &mut out_be, 33); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in the dead-code `tests/packed_rgb.rs` which +// is not declared in `tests/mod.rs`) so they are actually compiled and run. +// Width 65 = 2 × 32-lane AVX2 SIMD body + 1 scalar tail. 
+ +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_x2rgb10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 65); + x2rgb10_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 65); + x2rgb10_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 65); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx2_x2bgr10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + 
unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 65); + x2bgr10_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 65); + x2bgr10_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 65); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/x86_avx512/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/packed_rgb_16bit.rs index 3b000cbb..b1f16348 100644 --- a/src/row/arch/x86_avx512/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/packed_rgb_16bit.rs @@ -242,12 +242,22 @@ unsafe fn narrow_u16x32_to_u8x32(v: __m512i) -> __m256i { // ---- endian byte-swap helpers ----------------------------------------------- -/// Byte-swap every u16 lane in a `__m128i` when `BE = true`; no-op otherwise. +/// Compile-time host endianness. `true` on BE targets, `false` on LE. /// -/// Uses `_mm_shuffle_epi8` (SSSE3, a subset of AVX-512). +/// Used by the byte-swap helpers below to gate the swap on +/// `BE != HOST_NATIVE_BE`, covering all four `wire × host` quadrants. Mirrors +/// the gate established in `gray.rs` and the canonical NEON +/// `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in a `__m128i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE` — see [`byteswap512_if_be`] for the +/// full truth table. Uses `_mm_shuffle_epi8` (SSSE3, a subset of AVX-512). 
#[inline(always)] unsafe fn byteswap128_if_be(v: __m128i) -> __m128i { - if BE { + if BE != HOST_NATIVE_BE { const MASK: __m128i = unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; unsafe { _mm_shuffle_epi8(v, MASK) } @@ -256,12 +266,23 @@ unsafe fn byteswap128_if_be(v: __m128i) -> __m128i { } } -/// Byte-swap every u16 lane in a `__m512i` when `BE = true`; no-op otherwise. +/// Conditionally byte-swap every u16 lane in a `__m512i` so the returned +/// value is in **host-native** byte order regardless of the host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | /// -/// Uses `_mm512_shuffle_epi8` (AVX-512BW). +/// Uses `_mm512_shuffle_epi8` (AVX-512BW). The unused branch folds at +/// compile time since both `BE` and `HOST_NATIVE_BE` are constants. #[inline(always)] unsafe fn byteswap512_if_be(v: __m512i) -> __m512i { - if BE { + if BE != HOST_NATIVE_BE { // Same u16-lane byte-swap mask, broadcast across all 64 bytes. 
const MASK: __m512i = unsafe { core::mem::transmute([ diff --git a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs index 4ae0709b..ab4afac3 100644 --- a/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_avx512/tests/packed_rgb_16bit.rs @@ -779,3 +779,313 @@ fn avx512_bgra64_to_rgb_u16_lane_order_handcheck() { assert_eq!(simd_out[n * 3 + 2], (n as u16) + 200, "B at pixel {n}"); } } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 65 = 2 × 32-lane AVX-512 SIMD body + 1 scalar tail. + +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_rgb48_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = make_rgb48_src(65, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + avx512_rgb48_to_rgb_row::(&le, &mut out_le, 65); + avx512_rgb48_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + 
unsafe { + avx512_rgb48_to_rgba_row::(&le, &mut out_le, 65); + avx512_rgb48_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + avx512_rgb48_to_rgb_u16_row::(&le, &mut out_le, 65); + avx512_rgb48_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 65 * 4]; + let mut out_be = std::vec![0u16; 65 * 4]; + unsafe { + avx512_rgb48_to_rgba_u16_row::(&le, &mut out_le, 65); + avx512_rgb48_to_rgba_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_bgr48_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = make_rgb48_src(65, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + avx512_bgr48_to_rgb_row::(&le, &mut out_le, 65); + avx512_bgr48_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + avx512_bgr48_to_rgba_row::(&le, &mut out_le, 65); + avx512_bgr48_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + avx512_bgr48_to_rgb_u16_row::(&le, &mut out_le, 65); + avx512_bgr48_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 
65 * 4]; + let mut out_be = std::vec![0u16; 65 * 4]; + unsafe { + avx512_bgr48_to_rgba_u16_row::(&le, &mut out_le, 65); + avx512_bgr48_to_rgba_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_rgba64_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = make_rgba64_src(65, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + avx512_rgba64_to_rgb_row::(&le, &mut out_le, 65); + avx512_rgba64_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + avx512_rgba64_to_rgba_row::(&le, &mut out_le, 65); + avx512_rgba64_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + avx512_rgba64_to_rgb_u16_row::(&le, &mut out_le, 65); + avx512_rgba64_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 65 * 4]; + let mut out_be = std::vec![0u16; 65 * 4]; + unsafe { + avx512_rgba64_to_rgba_u16_row::(&le, &mut out_le, 65); + avx512_rgba64_to_rgba_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_bgra64_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = make_rgba64_src(65, 
0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + avx512_bgra64_to_rgb_row::(&le, &mut out_le, 65); + avx512_bgra64_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + avx512_bgra64_to_rgba_row::(&le, &mut out_le, 65); + avx512_bgra64_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + avx512_bgra64_to_rgb_u16_row::(&le, &mut out_le, 65); + avx512_bgra64_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 65 * 4]; + let mut out_be = std::vec![0u16; 65 * 4]; + unsafe { + avx512_bgra64_to_rgba_u16_row::(&le, &mut out_le, 65); + avx512_bgra64_to_rgba_u16_row::(&be, &mut out_be, 65); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in the dead-code `tests/packed_rgb.rs` which +// is not declared in `tests/mod.rs`) so they are actually compiled and run. 
+ +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_x2rgb10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 65 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 65); + x2rgb10_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 65); + x2rgb10_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 65); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn avx512_x2bgr10_be_le_simd_parity_width65() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + let intended = pseudo_random_x2_intended(65, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 65 * 3]; + let mut out_be = std::vec![0u8; 
65 * 3]; + unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 65); + x2bgr10_to_rgb_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 65 * 4]; + let mut out_be = std::vec![0u8; 65 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 65); + x2bgr10_to_rgba_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 65 * 3]; + let mut out_be = std::vec![0u16; 65 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 65); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 65); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} diff --git a/src/row/arch/x86_sse41/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/packed_rgb_16bit.rs index c9a8bff4..486279c1 100644 --- a/src/row/arch/x86_sse41/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/packed_rgb_16bit.rs @@ -308,13 +308,33 @@ unsafe fn narrow_u16x8_to_u8x8(v: __m128i, zero: __m128i) -> __m128i { // ---- endian byte-swap helper ------------------------------------------------ -/// Byte-swap every u16 lane in `v` when `BE = true`; no-op otherwise. +/// Compile-time host endianness. `true` on BE targets, `false` on LE. +/// +/// Used by [`byteswap_if_be`] to gate the byte-swap on `BE != HOST_NATIVE_BE` +/// so the swap fires only when the wire endian differs from the host's +/// native byte order — covering all four `wire × host` quadrants. Mirrors +/// the gate established in `gray.rs` and the canonical NEON +/// `bswap_u16x8_if_be` helper. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap every u16 lane in `v` so the returned value is in +/// **host-native** byte order regardless of the host endianness. 
+/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | /// /// Uses `_mm_shuffle_epi8` (SSSE3, a subset of SSE4.1) with the same mask as -/// `endian::BYTESWAP_MASK_U16`. +/// `endian::BYTESWAP_MASK_U16`. The unused branch folds at compile time +/// since `BE` and `HOST_NATIVE_BE` are both compile-time constants. #[inline(always)] unsafe fn byteswap_if_be(v: __m128i) -> __m128i { - if BE { + if BE != HOST_NATIVE_BE { // Swap bytes within each u16 lane: [1,0, 3,2, 5,4, 7,6, 9,8, 11,10, 13,12, 15,14] const MASK: __m128i = unsafe { core::mem::transmute([1u8, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14]) }; diff --git a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs index 57c5c8b6..34553ada 100644 --- a/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs +++ b/src/row/arch/x86_sse41/tests/packed_rgb_16bit.rs @@ -460,3 +460,315 @@ fn sse41_bgra64_to_rgba_u16_width1_tail_only() { "bgra64→rgba_u16 width=1: tail-only mismatch" ); } + +// ============================================================================= +// SIMD-level BE-vs-LE parity tests (probes `BE != HOST_NATIVE_BE` gate) +// ============================================================================= +// +// Buffers built host-independently via `to_le_bytes` / `to_be_bytes`. Width +// 17 = 2 × 8-lane SSE4.1 SIMD body + 1 scalar tail. 
+ +fn make_le_be_pair_u16(intended: &[u16]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + (le, be) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_rgb48_be_le_simd_parity_width17() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = make_rgb48_src(17, 0xACE1); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + sse41_rgb48_to_rgb_row::(&le, &mut out_le, 17); + sse41_rgb48_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + sse41_rgb48_to_rgba_row::(&le, &mut out_le, 17); + sse41_rgb48_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgb48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + sse41_rgb48_to_rgb_u16_row::(&le, &mut out_le, 17); + sse41_rgb48_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + sse41_rgb48_to_rgba_u16_row::(&le, &mut out_le, 17); + sse41_rgb48_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgb48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 
SIMD intrinsics unsupported by Miri")] +fn sse41_bgr48_be_le_simd_parity_width17() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = make_rgb48_src(17, 0xBEEF); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + sse41_bgr48_to_rgb_row::(&le, &mut out_le, 17); + sse41_bgr48_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + sse41_bgr48_to_rgba_row::(&le, &mut out_le, 17); + sse41_bgr48_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgr48→rgba SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + sse41_bgr48_to_rgb_u16_row::(&le, &mut out_le, 17); + sse41_bgr48_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + sse41_bgr48_to_rgba_u16_row::(&le, &mut out_le, 17); + sse41_bgr48_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgr48→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_rgba64_be_le_simd_parity_width17() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = make_rgba64_src(17, 0xCAFE); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + sse41_rgba64_to_rgb_row::(&le, &mut out_le, 17); + sse41_rgba64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "rgba64→rgb SIMD BE/LE parity (endian gate)"); + + let mut 
out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + sse41_rgba64_to_rgba_row::(&le, &mut out_le, 17); + sse41_rgba64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + sse41_rgba64_to_rgb_u16_row::(&le, &mut out_le, 17); + sse41_rgba64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + sse41_rgba64_to_rgba_u16_row::(&le, &mut out_le, 17); + sse41_rgba64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "rgba64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_bgra64_be_le_simd_parity_width17() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = make_rgba64_src(17, 0xF00D); + let (le, be) = make_le_be_pair_u16(&intended); + + let mut out_le = std::vec![0u8; 17 * 3]; + let mut out_be = std::vec![0u8; 17 * 3]; + unsafe { + sse41_bgra64_to_rgb_row::(&le, &mut out_le, 17); + sse41_bgra64_to_rgb_row::(&be, &mut out_be, 17); + } + assert_eq!(out_le, out_be, "bgra64→rgb SIMD BE/LE parity (endian gate)"); + + let mut out_le = std::vec![0u8; 17 * 4]; + let mut out_be = std::vec![0u8; 17 * 4]; + unsafe { + sse41_bgra64_to_rgba_row::(&le, &mut out_le, 17); + sse41_bgra64_to_rgba_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 3]; + let mut out_be = std::vec![0u16; 17 * 3]; + unsafe { + sse41_bgra64_to_rgb_u16_row::(&le, &mut out_le, 17); + sse41_bgra64_to_rgb_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, 
out_be, + "bgra64→rgb_u16 SIMD BE/LE parity (endian gate)" + ); + + let mut out_le = std::vec![0u16; 17 * 4]; + let mut out_be = std::vec![0u16; 17 * 4]; + unsafe { + sse41_bgra64_to_rgba_u16_row::(&le, &mut out_le, 17); + sse41_bgra64_to_rgba_u16_row::(&be, &mut out_be, 17); + } + assert_eq!( + out_le, out_be, + "bgra64→rgba_u16 SIMD BE/LE parity (endian gate)" + ); +} + +// ============================================================================= +// X2RGB10 / X2BGR10 SIMD-level BE-vs-LE parity tests +// ============================================================================= +// +// Co-located here (rather than in the dead-code `tests/packed_rgb.rs` which +// is not declared in `tests/mod.rs`) so they are actually compiled and run. +// Width 33 = 2 × 16-lane SSE4.1 SIMD body + 1 scalar tail (u8 outputs); +// the u16 output kernel uses 8 px / iter, so 33 = 4 × 8 + 1. + +fn pseudo_random_x2_intended(width: usize, seed: u32) -> std::vec::Vec { + let mut state = seed; + (0..width) + .map(|_| { + state = state.wrapping_mul(1_664_525).wrapping_add(1_013_904_223); + state + }) + .collect() +} + +fn make_le_be_pair_x2(intended: &[u32]) -> (std::vec::Vec, std::vec::Vec) { + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + (le_bytes, be_bytes) +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_x2rgb10_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = pseudo_random_x2_intended(33, 0xC0DE_BEEF); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2rgb10_to_rgb_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 
33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2rgb10_to_rgba_row::(&le, &mut out_le, 33); + x2rgb10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2rgb10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2rgb10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2rgb10→rgb_u16 SIMD BE/LE parity"); +} + +#[test] +#[cfg_attr(miri, ignore = "x86 SIMD intrinsics unsupported by Miri")] +fn sse41_x2bgr10_be_le_simd_parity_width33() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let intended = pseudo_random_x2_intended(33, 0xFEED_FACE); + let (le, be) = make_le_be_pair_x2(&intended); + + let mut out_le = std::vec![0u8; 33 * 3]; + let mut out_be = std::vec![0u8; 33 * 3]; + unsafe { + x2bgr10_to_rgb_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb SIMD BE/LE parity"); + + let mut out_le = std::vec![0u8; 33 * 4]; + let mut out_be = std::vec![0u8; 33 * 4]; + unsafe { + x2bgr10_to_rgba_row::(&le, &mut out_le, 33); + x2bgr10_to_rgba_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgba SIMD BE/LE parity"); + + let mut out_le = std::vec![0u16; 33 * 3]; + let mut out_be = std::vec![0u16; 33 * 3]; + unsafe { + x2bgr10_to_rgb_u16_row::(&le, &mut out_le, 33); + x2bgr10_to_rgb_u16_row::(&be, &mut out_be, 33); + } + assert_eq!(out_le, out_be, "x2bgr10→rgb_u16 SIMD BE/LE parity"); +} From 177a233f6d5bd334b65bf44f8eb95985331c2c76 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 01:06:03 +1200 Subject: [PATCH 4/6] fix(be-tier8): frame docs match LE-encoded contract + public row API compat wrappers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex 2nd-pass review of PR #87 surfaced 
two [high] findings: 1. Frame docs contradicted the LE-encoded plane contract. `src/frame/packed_rgb_16bit.rs` told big-endian callers to pre-normalize each `u16` via `u16::from_le` before constructing Rgb48/Bgr48/Rgba64/ Bgra64 frames. After PR #92 (`5b42065` / `3b1d716`) the documented contract is the opposite: the plane is the **LE-encoded byte layout** reinterpreted as `&[u16]` (matching FFmpeg's `*LE` suffix), and the downstream row kernel applies `u16::from_le` itself (no-op on LE host, byte-swap on BE host). A BE caller following the old docs would pre-swap → kernel swaps again → double swap → corrupted output on every BE host. Doc-comments on `Rgb48Frame` / `Bgr48Frame` / `Rgba64Frame` / `Bgra64Frame` now state the LE-encoded byte contract explicitly, mirroring the wording PR #92 used for `Rgbf32Frame` / `Gbrpf32Frame` / `Gbrapf32Frame` etc. The module-level header is updated to match. Adds an Rgb48 sinker BE-contract regression test (`rgb48_sinker_le_encoded_frame_decodes_correctly`) following the `rgbf32_sinker_le_encoded_frame_decodes_correctly` pattern from PR #92: builds the plane from LE-encoded `u16` patterns (`intended.to_le()`), forces `with_simd(false)` so it runs purely scalar, and asserts the `with_rgb_u16` identity-passthrough output bit-equals `intended`. On a BE host with a regressed pre-swap this would byte-swap every sample. The test runs under miri, which is exactly where BE CI surfaces. 2. Public row APIs broke backwards compatibility. PR #87 added `<const BE: bool>` to 28 public functions in `src/row/dispatch/packed_rgb_16bit.rs` (Rgb48/Bgr48/Rgba64/Bgra64 × {rgb, rgba, rgb_u16, rgba_u16, luma, luma_u16, hsv}) and 6 public functions in `src/row/dispatch/rgb_ops.rs` (X2Rgb10/X2Bgr10 × {rgb, rgba, rgb_u16}). All 34 functions are re-exported from `crate::row::*` (16-bit ones are explicit, x2 ones via `rgb_ops::*` glob).
Existing downstream LE-only callers cannot compile against the new const-generic signature without adding `::<false>` turbofish to every call site — a hard breaking change. Each function is renamed to `foo_endian` and a thin non-generic LE-only wrapper `foo` is added that forwards to `foo_endian::<false>`. Existing pre-Tier 8 call sites compile unchanged; sinker code is updated to call the explicit `_endian::<…>` form so endian intent is visible at every internal callsite. Audit / fan-out: - 28 packed-16-bit dispatchers renamed + 28 LE wrappers added - 6 X2Rgb10/X2Bgr10 dispatchers renamed + 6 LE wrappers added - `src/row/mod.rs` re-exports both `foo` and `foo_endian` for the 16 pub `→{rgb,rgba,rgb_u16,rgba_u16}` entries and pub(crate) for the 12 `→{luma,luma_u16,hsv}` entries - 12 internal turbofish call sites updated to `_endian::<…>` across `src/sinker/mixed/packed_rgb_16bit.rs`, `src/sinker/mixed/packed_rgb_10bit.rs`, and the in-file dispatcher unit tests in `src/row/dispatch/packed_rgb_16bit.rs` The wrappers carry `#[allow(clippy::too_many_arguments)]` only when the underlying `_endian` form already does (luma/luma_u16/hsv variants); this matches the pre-existing pattern on the BE-aware definitions and is not a new suppression.
Verified: cargo test --target aarch64-apple-darwin --lib cargo build --target x86_64-apple-darwin --tests # 0 warnings RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests cargo build --no-default-features cargo fmt --check cargo clippy --all-targets --all-features -- -D warnings cargo check --target s390x-unknown-linux-gnu --lib # BE-host smoke Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/packed_rgb_16bit.rs | 73 ++- src/row/dispatch/packed_rgb_16bit.rs | 552 ++++++++++++++++++--- src/row/dispatch/rgb_ops.rs | 60 ++- src/row/mod.rs | 30 +- src/sinker/mixed/packed_rgb_10bit.rs | 17 +- src/sinker/mixed/packed_rgb_16bit.rs | 53 +- src/sinker/mixed/tests/packed_rgb_16bit.rs | 61 +++ 7 files changed, 708 insertions(+), 138 deletions(-) diff --git a/src/frame/packed_rgb_16bit.rs b/src/frame/packed_rgb_16bit.rs index e06607ca..492546e2 100644 --- a/src/frame/packed_rgb_16bit.rs +++ b/src/frame/packed_rgb_16bit.rs @@ -4,13 +4,21 @@ //! - `AV_PIX_FMT_RGBA64LE` → [`Rgba64Frame`] (R, G, B, A; stride in u16 elements ≥ 4 × width) //! - `AV_PIX_FMT_BGRA64LE` → [`Bgra64Frame`] (B, G, R, A; stride in u16 elements ≥ 4 × width) //! -//! Stride is in **u16 elements** (not bytes). Plane slice is `&[u16]`. -//! Callers with a raw FFmpeg byte buffer should cast via `bytemuck::cast_slice` -//! (which checks alignment at runtime) and divide `linesize[0]` by 2. Direct -//! pointer casts to `&[u16]` are undefined behaviour if the byte buffer is not -//! 2-byte aligned, and produce wrong values on big-endian hosts — all FFmpeg -//! `*LE` formats store samples little-endian, so big-endian targets would also -//! need per-sample `u16::from_le` conversion. +//! # Endian contract — **LE-encoded bytes** +//! +//! The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +//! `u16`, matching the FFmpeg `*LE` pixel-format suffix in the format name. +//! On a little-endian host (every CI runner today) LE bytes _are_ host-native, +//! 
so `&[u16]` is also a host-native u16 slice; on a big-endian host the bytes +//! have to be byte-swapped back to host-native before arithmetic. Downstream +//! row kernels handle this byte-swap (or no-op on LE) under the hood — +//! callers do **not** pre-swap. +//! +//! Stride is in **u16 elements** (not bytes). Callers holding a raw FFmpeg +//! byte buffer should cast via `bytemuck::cast_slice` (which checks alignment +//! at runtime) and divide `linesize[0]` by 2 before constructing. Direct +//! pointer casts to `&[u16]` are undefined behaviour if the byte buffer is +//! not 2-byte aligned. use derive_more::IsVariant; use thiserror::Error; @@ -62,10 +70,19 @@ pub enum Rgb48FrameError { } /// A validated packed **RGB48** frame (`AV_PIX_FMT_RGB48LE`) — three `u16` -/// samples per pixel in `R, G, B` order. Each `u16` is a native little-endian -/// sample; the caller is responsible for casting the raw FFmpeg byte buffer. +/// samples per pixel in `R, G, B` order. +/// +/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +/// `u16`, matching the FFmpeg `*LE` pixel-format suffix in the format name. +/// On a little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. /// -/// `stride` is in **u16 elements** (≥ `3 * width`). +/// `stride` is in **u16 elements** (≥ `3 * width`). Callers holding byte +/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Rgb48Frame<'a> { rgb48: &'a [u16], @@ -197,7 +214,17 @@ pub enum Bgr48FrameError { /// samples per pixel in `B, G, R` order. 
Channel order is reversed relative /// to [`Rgb48Frame`]; stride convention and element type are identical. /// -/// `stride` is in **u16 elements** (≥ `3 * width`). +/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +/// `u16`, matching the FFmpeg `*LE` pixel-format suffix in the format name. +/// On a little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// `stride` is in **u16 elements** (≥ `3 * width`). Callers holding byte +/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Bgr48Frame<'a> { bgr48: &'a [u16], @@ -329,7 +356,17 @@ pub enum Rgba64FrameError { /// samples per pixel in `R, G, B, A` order. The alpha channel is real /// (not padding) and is passed through by `with_rgba` / `with_rgba_u16`. /// -/// `stride` is in **u16 elements** (≥ `4 * width`). +/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +/// `u16`, matching the FFmpeg `*LE` pixel-format suffix in the format name. +/// On a little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// `stride` is in **u16 elements** (≥ `4 * width`). Callers holding byte +/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. 
#[derive(Debug, Clone, Copy)] pub struct Rgba64Frame<'a> { rgba64: &'a [u16], @@ -462,7 +499,17 @@ pub enum Bgra64FrameError { /// first three elements relative to [`Rgba64Frame`]; alpha at position 3 is /// real and is passed through by `with_rgba` / `with_rgba_u16`. /// -/// `stride` is in **u16 elements** (≥ `4 * width`). +/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +/// `u16`, matching the FFmpeg `*LE` pixel-format suffix in the format name. +/// On a little-endian host (every CI runner today) LE bytes _are_ host-native, +/// so `&[u16]` is also a host-native u16 slice; on a big-endian host the +/// bytes have to be byte-swapped back to host-native before arithmetic. +/// Downstream row kernels handle this byte-swap (or no-op on LE) under the +/// hood — callers do **not** pre-swap. +/// +/// `stride` is in **u16 elements** (≥ `4 * width`). Callers holding byte +/// buffers from FFmpeg should cast via `bytemuck::cast_slice` and divide +/// `linesize[0]` by 2 before constructing. #[derive(Debug, Clone, Copy)] pub struct Bgra64Frame<'a> { bgra64: &'a [u16], diff --git a/src/row/dispatch/packed_rgb_16bit.rs b/src/row/dispatch/packed_rgb_16bit.rs index 6ceb854a..d3944dbf 100644 --- a/src/row/dispatch/packed_rgb_16bit.rs +++ b/src/row/dispatch/packed_rgb_16bit.rs @@ -72,7 +72,7 @@ fn rgba64_packed_elems(width: usize) -> usize { /// Converts one row of `Rgb48` to packed u8 RGB. Each 16-bit channel is /// narrowed via `>> 8`. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_row( +pub fn rgb48_to_rgb_row_endian( rgb48: &[u16], rgb_out: &mut [u8], width: usize, @@ -114,10 +114,18 @@ pub fn rgb48_to_rgb_row( scalar::rgb48_to_rgb_row::(rgb48, rgb_out, width); } +/// LE-only wrapper around [`rgb48_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_rgb_row_endian::(...)`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_rgb_row(rgb48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { + rgb48_to_rgb_row_endian::(rgb48, rgb_out, width, use_simd) +} + /// Converts one row of `Rgb48` to packed u8 RGBA. Alpha forced to `0xFF`. /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_row( +pub fn rgb48_to_rgba_row_endian( rgb48: &[u16], rgba_out: &mut [u8], width: usize, @@ -159,10 +167,18 @@ pub fn rgb48_to_rgba_row( scalar::rgb48_to_rgba_row::(rgb48, rgba_out, width); } +/// LE-only wrapper around [`rgb48_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_rgba_row(rgb48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { + rgb48_to_rgba_row_endian::(rgb48, rgba_out, width, use_simd) +} + /// Converts one row of `Rgb48` to native-depth u16 RGB (identity copy). /// `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgb_u16_row( +pub fn rgb48_to_rgb_u16_row_endian( rgb48: &[u16], rgb_out: &mut [u16], width: usize, @@ -204,10 +220,18 @@ pub fn rgb48_to_rgb_u16_row( scalar::rgb48_to_rgb_u16_row::(rgb48, rgb_out, width); } +/// LE-only wrapper around [`rgb48_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_rgb_u16_row(rgb48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { + rgb48_to_rgb_u16_row_endian::(rgb48, rgb_out, width, use_simd) +} + /// Converts one row of `Rgb48` to native-depth u16 RGBA. Alpha forced to /// `0xFFFF`. `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb48_to_rgba_u16_row( +pub fn rgb48_to_rgba_u16_row_endian( rgb48: &[u16], rgba_out: &mut [u16], width: usize, @@ -249,12 +273,20 @@ pub fn rgb48_to_rgba_u16_row( scalar::rgb48_to_rgba_u16_row::(rgb48, rgba_out, width); } +/// LE-only wrapper around [`rgb48_to_rgba_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_rgba_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_rgba_u16_row(rgb48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { + rgb48_to_rgba_u16_row_endian::(rgb48, rgba_out, width, use_simd) +} + /// Derives 8-bit luma from one row of `Rgb48` source. Narrows to u8 RGB via /// `rgb48_to_rgb_row` into `rgb_scratch` (length ≥ `width × 3`), then applies /// `rgb_to_luma_row`. `use_simd = false` forces the scalar path for both steps. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_row( +pub fn rgb48_to_luma_row_endian( rgb48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -268,17 +300,42 @@ pub fn rgb48_to_luma_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row_endian::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`rgb48_to_luma_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_luma_row_endian::(...)`. 
+#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_luma_row( + rgb48: &[u16], + luma_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + rgb48_to_luma_row_endian::( + rgb48, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives u16 luma from one row of `Rgb48` source (Y' is computed at 8-bit /// precision and zero-extended). Narrows to u8 RGB via `rgb48_to_rgb_row` into /// `rgb_scratch`, then applies `rgb_to_luma_u16_row`. `use_simd = false` forces /// the scalar path for both steps. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_luma_u16_row( +pub fn rgb48_to_luma_u16_row_endian( rgb48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -292,16 +349,41 @@ pub fn rgb48_to_luma_u16_row( assert!(rgb48.len() >= in_min, "rgb48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row_endian::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`rgb48_to_luma_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_luma_u16_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_luma_u16_row( + rgb48: &[u16], + luma_out: &mut [u16], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + rgb48_to_luma_u16_row_endian::( + rgb48, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives planar HSV from one row of `Rgb48` source (OpenCV 8-bit encoding). 
/// Narrows to u8 RGB via `rgb48_to_rgb_row` into `rgb_scratch`, then applies /// `rgb_to_hsv_row`. `use_simd = false` forces the scalar path for both steps. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgb48_to_hsv_row( +pub fn rgb48_to_hsv_row_endian( rgb48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -317,10 +399,27 @@ pub fn rgb48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgb48_to_rgb_row::(rgb48, rgb_scratch, width, use_simd); + rgb48_to_rgb_row_endian::(rgb48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } +/// LE-only wrapper around [`rgb48_to_hsv_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgb48_to_hsv_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb48_to_hsv_row( + rgb48: &[u16], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + use_simd: bool, +) { + rgb48_to_hsv_row_endian::(rgb48, h_out, s_out, v_out, rgb_scratch, width, use_simd) +} + // ============================================================================= // Bgr48 (B, G, R — 3 u16 elements per pixel) // ============================================================================= @@ -328,7 +427,7 @@ pub fn rgb48_to_hsv_row( /// Converts one row of `Bgr48` to packed u8 RGB (B↔R swap, narrow via `>> 8`). /// `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_row( +pub fn bgr48_to_rgb_row_endian( bgr48: &[u16], rgb_out: &mut [u8], width: usize, @@ -370,10 +469,18 @@ pub fn bgr48_to_rgb_row( scalar::bgr48_to_rgb_row::(bgr48, rgb_out, width); } +/// LE-only wrapper around [`bgr48_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_rgb_row(bgr48: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { + bgr48_to_rgb_row_endian::(bgr48, rgb_out, width, use_simd) +} + /// Converts one row of `Bgr48` to packed u8 RGBA (B↔R swap, alpha forced to /// `0xFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_row( +pub fn bgr48_to_rgba_row_endian( bgr48: &[u16], rgba_out: &mut [u8], width: usize, @@ -415,10 +522,18 @@ pub fn bgr48_to_rgba_row( scalar::bgr48_to_rgba_row::(bgr48, rgba_out, width); } +/// LE-only wrapper around [`bgr48_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_rgba_row(bgr48: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { + bgr48_to_rgba_row_endian::(bgr48, rgba_out, width, use_simd) +} + /// Converts one row of `Bgr48` to native-depth u16 RGB (B↔R swap, values /// unchanged). `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgb_u16_row( +pub fn bgr48_to_rgb_u16_row_endian( bgr48: &[u16], rgb_out: &mut [u16], width: usize, @@ -460,10 +575,18 @@ pub fn bgr48_to_rgb_u16_row( scalar::bgr48_to_rgb_u16_row::(bgr48, rgb_out, width); } +/// LE-only wrapper around [`bgr48_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_rgb_u16_row(bgr48: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { + bgr48_to_rgb_u16_row_endian::(bgr48, rgb_out, width, use_simd) +} + /// Converts one row of `Bgr48` to native-depth u16 RGBA (B↔R swap, alpha /// forced to `0xFFFF`). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr48_to_rgba_u16_row( +pub fn bgr48_to_rgba_u16_row_endian( bgr48: &[u16], rgba_out: &mut [u16], width: usize, @@ -505,11 +628,19 @@ pub fn bgr48_to_rgba_u16_row( scalar::bgr48_to_rgba_u16_row::(bgr48, rgba_out, width); } +/// LE-only wrapper around [`bgr48_to_rgba_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_rgba_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_rgba_u16_row(bgr48: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { + bgr48_to_rgba_u16_row_endian::(bgr48, rgba_out, width, use_simd) +} + /// Derives 8-bit luma from one row of `Bgr48` source. Narrows to u8 RGB via /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_row( +pub fn bgr48_to_luma_row_endian( bgr48: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -523,15 +654,40 @@ pub fn bgr48_to_luma_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row_endian::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`bgr48_to_luma_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_luma_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_luma_row( + bgr48: &[u16], + luma_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + bgr48_to_luma_row_endian::( + bgr48, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives u16 luma from one row of `Bgr48` source. Narrows to u8 RGB via /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_luma_u16_row( +pub fn bgr48_to_luma_u16_row_endian( bgr48: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -545,15 +701,40 @@ pub fn bgr48_to_luma_u16_row( assert!(bgr48.len() >= in_min, "bgr48 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row_endian::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`bgr48_to_luma_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_luma_u16_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_luma_u16_row( + bgr48: &[u16], + luma_out: &mut [u16], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + bgr48_to_luma_u16_row_endian::( + bgr48, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives planar HSV from one row of `Bgr48` source. Narrows to u8 RGB via /// `bgr48_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgr48_to_hsv_row( +pub fn bgr48_to_hsv_row_endian( bgr48: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -569,10 +750,27 @@ pub fn bgr48_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgr48_to_rgb_row::(bgr48, rgb_scratch, width, use_simd); + bgr48_to_rgb_row_endian::(bgr48, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } +/// LE-only wrapper around [`bgr48_to_hsv_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgr48_to_hsv_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr48_to_hsv_row( + bgr48: &[u16], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + use_simd: bool, +) { + bgr48_to_hsv_row_endian::(bgr48, h_out, s_out, v_out, rgb_scratch, width, use_simd) +} + // ============================================================================= // Rgba64 (R, G, B, A — 4 u16 elements per pixel, source alpha real) // ============================================================================= @@ -580,7 +778,7 @@ pub fn bgr48_to_hsv_row( /// Converts one row of `Rgba64` to packed u8 RGB. Source alpha is discarded; /// R/G/B narrowed via `>> 8`. `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_row( +pub fn rgba64_to_rgb_row_endian( rgba64: &[u16], rgb_out: &mut [u8], width: usize, @@ -622,10 +820,18 @@ pub fn rgba64_to_rgb_row( scalar::rgba64_to_rgb_row::(rgba64, rgb_out, width); } +/// LE-only wrapper around [`rgba64_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_rgb_row(rgba64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { + rgba64_to_rgb_row_endian::(rgba64, rgb_out, width, use_simd) +} + /// Converts one row of `Rgba64` to packed u8 RGBA. All 4 channels narrowed via /// `>> 8`; source alpha passes through. `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_row( +pub fn rgba64_to_rgba_row_endian( rgba64: &[u16], rgba_out: &mut [u8], width: usize, @@ -667,10 +873,18 @@ pub fn rgba64_to_rgba_row( scalar::rgba64_to_rgba_row::(rgba64, rgba_out, width); } +/// LE-only wrapper around [`rgba64_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_rgba_row(rgba64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { + rgba64_to_rgba_row_endian::(rgba64, rgba_out, width, use_simd) +} + /// Converts one row of `Rgba64` to native-depth u16 RGB. Source alpha /// discarded; R/G/B copied as-is. `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgb_u16_row( +pub fn rgba64_to_rgb_u16_row_endian( rgba64: &[u16], rgb_out: &mut [u16], width: usize, @@ -712,10 +926,18 @@ pub fn rgba64_to_rgb_u16_row( scalar::rgba64_to_rgb_u16_row::(rgba64, rgb_out, width); } +/// LE-only wrapper around [`rgba64_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_rgb_u16_row(rgba64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { + rgba64_to_rgb_u16_row_endian::(rgba64, rgb_out, width, use_simd) +} + /// Converts one row of `Rgba64` to native-depth u16 RGBA (identity copy of all /// 4 channels; source alpha preserved). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgba64_to_rgba_u16_row( +pub fn rgba64_to_rgba_u16_row_endian( rgba64: &[u16], rgba_out: &mut [u16], width: usize, @@ -757,12 +979,20 @@ pub fn rgba64_to_rgba_u16_row( scalar::rgba64_to_rgba_u16_row::(rgba64, rgba_out, width); } +/// LE-only wrapper around [`rgba64_to_rgba_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_rgba_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_rgba_u16_row(rgba64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { + rgba64_to_rgba_u16_row_endian::(rgba64, rgba_out, width, use_simd) +} + /// Derives 8-bit luma from one row of `Rgba64` source. Narrows to u8 RGB via /// `rgba64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_row( +pub fn rgba64_to_luma_row_endian( rgba64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -776,16 +1006,41 @@ pub fn rgba64_to_luma_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row_endian::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`rgba64_to_luma_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_luma_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_luma_row( + rgba64: &[u16], + luma_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + rgba64_to_luma_row_endian::( + rgba64, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives u16 luma from one row of `Rgba64` source. Narrows to u8 RGB via /// `rgba64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_luma_u16_row( +pub fn rgba64_to_luma_u16_row_endian( rgba64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -799,16 +1054,41 @@ pub fn rgba64_to_luma_u16_row( assert!(rgba64.len() >= in_min, "rgba64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row_endian::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`rgba64_to_luma_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_luma_u16_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_luma_u16_row( + rgba64: &[u16], + luma_out: &mut [u16], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + rgba64_to_luma_u16_row_endian::( + rgba64, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives planar HSV from one row of `Rgba64` source. Narrows to u8 RGB via /// `rgba64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn rgba64_to_hsv_row( +pub fn rgba64_to_hsv_row_endian( rgba64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -824,10 +1104,27 @@ pub fn rgba64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - rgba64_to_rgb_row::(rgba64, rgb_scratch, width, use_simd); + rgba64_to_rgb_row_endian::(rgba64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } +/// LE-only wrapper around [`rgba64_to_hsv_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `rgba64_to_hsv_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgba64_to_hsv_row( + rgba64: &[u16], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + use_simd: bool, +) { + rgba64_to_hsv_row_endian::(rgba64, h_out, s_out, v_out, rgb_scratch, width, use_simd) +} + // ============================================================================= // Bgra64 (B, G, R, A — 4 u16 elements per pixel, source alpha real) // ============================================================================= @@ -835,7 +1132,7 @@ pub fn rgba64_to_hsv_row( /// Converts one row of `Bgra64` to packed u8 RGB (B↔R swap, drop alpha, /// narrow via `>> 8`). `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_row( +pub fn bgra64_to_rgb_row_endian( bgra64: &[u16], rgb_out: &mut [u8], width: usize, @@ -877,11 +1174,19 @@ pub fn bgra64_to_rgb_row( scalar::bgra64_to_rgb_row::(bgra64, rgb_out, width); } +/// LE-only wrapper around [`bgra64_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgb_row(bgra64: &[u16], rgb_out: &mut [u8], width: usize, use_simd: bool) { + bgra64_to_rgb_row_endian::(bgra64, rgb_out, width, use_simd) +} + /// Converts one row of `Bgra64` to packed u8 RGBA (B↔R swap, all 4 channels /// narrowed via `>> 8`; source alpha passes through). `use_simd = false` forces /// the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_row( +pub fn bgra64_to_rgba_row_endian( bgra64: &[u16], rgba_out: &mut [u8], width: usize, @@ -923,10 +1228,18 @@ pub fn bgra64_to_rgba_row( scalar::bgra64_to_rgba_row::(bgra64, rgba_out, width); } +/// LE-only wrapper around [`bgra64_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgba_row(bgra64: &[u16], rgba_out: &mut [u8], width: usize, use_simd: bool) { + bgra64_to_rgba_row_endian::(bgra64, rgba_out, width, use_simd) +} + /// Converts one row of `Bgra64` to native-depth u16 RGB (B↔R swap, drop alpha, /// values copied as-is). `use_simd = false` forces the scalar path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgb_u16_row( +pub fn bgra64_to_rgb_u16_row_endian( bgra64: &[u16], rgb_out: &mut [u16], width: usize, @@ -968,10 +1281,18 @@ pub fn bgra64_to_rgb_u16_row( scalar::bgra64_to_rgb_u16_row::(bgra64, rgb_out, width); } +/// LE-only wrapper around [`bgra64_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgb_u16_row(bgra64: &[u16], rgb_out: &mut [u16], width: usize, use_simd: bool) { + bgra64_to_rgb_u16_row_endian::(bgra64, rgb_out, width, use_simd) +} + /// Converts one row of `Bgra64` to native-depth u16 RGBA (B↔R swap; source /// alpha preserved at position 3). `use_simd = false` forces the scalar path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgra64_to_rgba_u16_row( +pub fn bgra64_to_rgba_u16_row_endian( bgra64: &[u16], rgba_out: &mut [u16], width: usize, @@ -1013,12 +1334,20 @@ pub fn bgra64_to_rgba_u16_row( scalar::bgra64_to_rgba_u16_row::(bgra64, rgba_out, width); } +/// LE-only wrapper around [`bgra64_to_rgba_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_rgba_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_rgba_u16_row(bgra64: &[u16], rgba_out: &mut [u16], width: usize, use_simd: bool) { + bgra64_to_rgba_u16_row_endian::(bgra64, rgba_out, width, use_simd) +} + /// Derives 8-bit luma from one row of `Bgra64` source. Narrows to u8 RGB via /// `bgra64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_row( +pub fn bgra64_to_luma_row_endian( bgra64: &[u16], luma_out: &mut [u8], rgb_scratch: &mut [u8], @@ -1032,16 +1361,41 @@ pub fn bgra64_to_luma_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row_endian::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`bgra64_to_luma_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_luma_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_luma_row( + bgra64: &[u16], + luma_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + bgra64_to_luma_row_endian::( + bgra64, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives u16 luma from one row of `Bgra64` source. Narrows to u8 RGB via /// `bgra64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_luma_u16_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_luma_u16_row( +pub fn bgra64_to_luma_u16_row_endian( bgra64: &[u16], luma_out: &mut [u16], rgb_scratch: &mut [u8], @@ -1055,16 +1409,41 @@ pub fn bgra64_to_luma_u16_row( assert!(bgra64.len() >= in_min, "bgra64 row too short"); assert!(rgb_scratch.len() >= scratch_min, "rgb_scratch too short"); assert!(luma_out.len() >= width, "luma_out row too short"); - bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row_endian::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_luma_u16_row(rgb_scratch, luma_out, width, matrix, full_range); } +/// LE-only wrapper around [`bgra64_to_luma_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_luma_u16_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_luma_u16_row( + bgra64: &[u16], + luma_out: &mut [u16], + rgb_scratch: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + bgra64_to_luma_u16_row_endian::( + bgra64, + luma_out, + rgb_scratch, + width, + matrix, + full_range, + use_simd, + ) +} + /// Derives planar HSV from one row of `Bgra64` source. Narrows to u8 RGB via /// `bgra64_to_rgb_row` into `rgb_scratch`, then applies `rgb_to_hsv_row`. /// Source alpha is discarded. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn bgra64_to_hsv_row( +pub fn bgra64_to_hsv_row_endian( bgra64: &[u16], h_out: &mut [u8], s_out: &mut [u8], @@ -1080,10 +1459,27 @@ pub fn bgra64_to_hsv_row( assert!(h_out.len() >= width, "h_out row too short"); assert!(s_out.len() >= width, "s_out row too short"); assert!(v_out.len() >= width, "v_out row too short"); - bgra64_to_rgb_row::(bgra64, rgb_scratch, width, use_simd); + bgra64_to_rgb_row_endian::(bgra64, rgb_scratch, width, use_simd); scalar::rgb_to_hsv_row(rgb_scratch, h_out, s_out, v_out, width); } +/// LE-only wrapper around [`bgra64_to_hsv_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `bgra64_to_hsv_row_endian::(...)`. +#[allow(clippy::too_many_arguments)] +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgra64_to_hsv_row( + bgra64: &[u16], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + rgb_scratch: &mut [u8], + width: usize, + use_simd: bool, +) { + bgra64_to_hsv_row_endian::(bgra64, h_out, s_out, v_out, rgb_scratch, width, use_simd) +} + // ============================================================================= // Tests // ============================================================================= @@ -1115,7 +1511,7 @@ mod tests { // All-white Rgb48: each u16 channel = 0xFFFF; narrowed >> 8 = 0xFF. 
let src = solid_rgb48(4, 0xFFFF); let mut rgb = std::vec![0u8; 4 * 3]; - rgb48_to_rgb_row::(&src, &mut rgb, 4, false); + rgb48_to_rgb_row_endian::(&src, &mut rgb, 4, false); assert!( rgb.iter().all(|&v| v == 0xFF), "expected all 0xFF, got {rgb:?}" @@ -1126,7 +1522,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_scalar_path() { let src = solid_rgb48(4, 0x1200); let mut rgba = std::vec![0u8; 4 * 4]; - rgb48_to_rgba_row::(&src, &mut rgba, 4, false); + rgb48_to_rgba_row_endian::(&src, &mut rgba, 4, false); for px in rgba.chunks(4) { assert_eq!(px[0], 0x12, "R channel"); assert_eq!(px[3], 0xFF, "alpha forced to 0xFF"); @@ -1137,7 +1533,7 @@ mod tests { fn rgb48_dispatcher_to_rgb_u16_scalar_path() { let src = solid_rgb48(4, 0xABCD); let mut rgb_u16 = std::vec![0u16; 4 * 3]; - rgb48_to_rgb_u16_row::(&src, &mut rgb_u16, 4, false); + rgb48_to_rgb_u16_row_endian::(&src, &mut rgb_u16, 4, false); assert!( rgb_u16.iter().all(|&v| v == 0xABCD), "expected identity copy" @@ -1148,7 +1544,7 @@ mod tests { fn rgb48_dispatcher_to_rgba_u16_scalar_path() { let src = solid_rgb48(4, 0x1234); let mut rgba_u16 = std::vec![0u16; 4 * 4]; - rgb48_to_rgba_u16_row::(&src, &mut rgba_u16, 4, false); + rgb48_to_rgba_u16_row_endian::(&src, &mut rgba_u16, 4, false); for px in rgba_u16.chunks(4) { assert_eq!(px[0], 0x1234, "R channel"); assert_eq!(px[3], 0xFFFF, "alpha forced to 0xFFFF"); @@ -1161,7 +1557,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgb48_to_luma_row::( + rgb48_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1180,7 +1576,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgb48_to_luma_u16_row::( + rgb48_to_luma_u16_row_endian::( &src, &mut luma, &mut scratch, @@ -1205,7 +1601,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgb48_to_hsv_row::(&src, &mut h, 
&mut s, &mut v, &mut scratch, 1, false); + rgb48_to_hsv_row_endian::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 0, "H for pure red must be 0"); assert_eq!(s[0], 255, "S for pure red must be 255"); assert!(v[0] >= 254, "V for pure red must be near 255, got {}", v[0]); @@ -1218,7 +1614,7 @@ mod tests { // Bgr48 pixel [B=0x1100, G=0x2200, R=0x3300] → rgb [R=0x33, G=0x22, B=0x11]. let src = [0x1100u16, 0x2200, 0x3300]; let mut rgb = [0u8; 3]; - bgr48_to_rgb_row::(&src, &mut rgb, 1, false); + bgr48_to_rgb_row_endian::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1228,7 +1624,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_scalar_path() { let src = [0x1100u16, 0x2200, 0x3300]; let mut rgba = [0u8; 4]; - bgr48_to_rgba_row::(&src, &mut rgba, 1, false); + bgr48_to_rgba_row_endian::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R"); assert_eq!(rgba[3], 0xFF, "alpha forced to 0xFF"); } @@ -1237,7 +1633,7 @@ mod tests { fn bgr48_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgb_u16 = [0u16; 3]; - bgr48_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); + bgr48_to_rgb_u16_row_endian::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1247,7 +1643,7 @@ mod tests { fn bgr48_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333]; // B, G, R let mut rgba_u16 = [0u16; 4]; - bgr48_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); + bgr48_to_rgba_u16_row_endian::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R"); assert_eq!(rgba_u16[3], 0xFFFF, "alpha forced to 0xFFFF"); } @@ -1257,7 +1653,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); // all channels = 0xFF00 let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - 
bgr48_to_luma_row::( + bgr48_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1276,7 +1672,7 @@ mod tests { let src = solid_rgb48(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgr48_to_luma_u16_row::( + bgr48_to_luma_u16_row_endian::( &src, &mut luma, &mut scratch, @@ -1299,7 +1695,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgr48_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + bgr48_to_hsv_row_endian::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure blue must be 255"); assert!( @@ -1316,7 +1712,7 @@ mod tests { // Source alpha should be dropped; R/G/B narrowed. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; // R, G, B, A let mut rgb = [0u8; 3]; - rgba64_to_rgb_row::(&src, &mut rgb, 1, false); + rgba64_to_rgb_row_endian::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x11, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x33, "B"); @@ -1327,7 +1723,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8. let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - rgba64_to_rgba_row::(&src, &mut rgba, 1, false); + rgba64_to_rgba_row_endian::(&src, &mut rgba, 1, false); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1335,7 +1731,7 @@ mod tests { fn rgba64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; let mut rgb_u16 = [0u16; 3]; - rgba64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); + rgba64_to_rgb_u16_row_endian::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x1111, "R"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x3333, "B"); @@ -1346,7 +1742,7 @@ mod tests { // Identity copy; source alpha preserved. 
let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; let mut rgba_u16 = [0u16; 4]; - rgba64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); + rgba64_to_rgba_u16_row_endian::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x1111, "R"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1357,7 +1753,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - rgba64_to_luma_row::( + rgba64_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1376,7 +1772,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - rgba64_to_luma_u16_row::( + rgba64_to_luma_u16_row_endian::( &src, &mut luma, &mut scratch, @@ -1401,7 +1797,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - rgba64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + rgba64_to_hsv_row_endian::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 60, "H for pure green must be 60 in OpenCV encoding"); assert_eq!(s[0], 255, "S for pure green must be 255"); assert!( @@ -1418,7 +1814,7 @@ mod tests { // Bgra64: B=0x1100, G=0x2200, R=0x3300, A=0xDEAD → RGB [R=0x33, G=0x22, B=0x11]. let src = [0x1100u16, 0x2200, 0x3300, 0xDEAD]; let mut rgb = [0u8; 3]; - bgra64_to_rgb_row::(&src, &mut rgb, 1, false); + bgra64_to_rgb_row_endian::(&src, &mut rgb, 1, false); assert_eq!(rgb[0], 0x33, "R"); assert_eq!(rgb[1], 0x22, "G"); assert_eq!(rgb[2], 0x11, "B"); @@ -1429,7 +1825,7 @@ mod tests { // Source alpha 0xABCD → 0xAB after >> 8; channels swapped. 
let src = [0x1100u16, 0x2200, 0x3300, 0xABCD]; let mut rgba = [0u8; 4]; - bgra64_to_rgba_row::(&src, &mut rgba, 1, false); + bgra64_to_rgba_row_endian::(&src, &mut rgba, 1, false); assert_eq!(rgba[0], 0x33, "R (from position 2)"); assert_eq!(rgba[3], 0xAB, "source alpha depth-converted >> 8"); } @@ -1438,7 +1834,7 @@ mod tests { fn bgra64_dispatcher_to_rgb_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xDEAD]; // B, G, R, A let mut rgb_u16 = [0u16; 3]; - bgra64_to_rgb_u16_row::(&src, &mut rgb_u16, 1, false); + bgra64_to_rgb_u16_row_endian::(&src, &mut rgb_u16, 1, false); assert_eq!(rgb_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgb_u16[1], 0x2222, "G"); assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); @@ -1448,7 +1844,7 @@ mod tests { fn bgra64_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; // B, G, R, A let mut rgba_u16 = [0u16; 4]; - bgra64_to_rgba_u16_row::(&src, &mut rgba_u16, 1, false); + bgra64_to_rgba_u16_row_endian::(&src, &mut rgba_u16, 1, false); assert_eq!(rgba_u16[0], 0x3333, "R (from position 2)"); assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } @@ -1458,7 +1854,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u8; 4]; - bgra64_to_luma_row::( + bgra64_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1477,7 +1873,7 @@ mod tests { let src = solid_rgba64(4, 0xFF00); let mut scratch = std::vec![0u8; 4 * 3]; let mut luma = std::vec![0u16; 4]; - bgra64_to_luma_u16_row::( + bgra64_to_luma_u16_row_endian::( &src, &mut luma, &mut scratch, @@ -1503,7 +1899,7 @@ mod tests { let mut h = std::vec![0u8; 1]; let mut s = std::vec![0u8; 1]; let mut v = std::vec![0u8; 1]; - bgra64_to_hsv_row::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); + bgra64_to_hsv_row_endian::(&src, &mut h, &mut s, &mut v, &mut scratch, 1, false); assert_eq!(h[0], 120, "H for pure blue must be 120 in OpenCV encoding"); assert_eq!(s[0], 
255, "S for pure blue must be 255"); assert!( @@ -1520,7 +1916,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_input() { let src = [0u16; 2]; // needs 3 for width=1 let mut out = [0u8; 3]; - rgb48_to_rgb_row::(&src, &mut out, 1, false); + rgb48_to_rgb_row_endian::(&src, &mut out, 1, false); } #[test] @@ -1528,7 +1924,7 @@ mod tests { fn rgb48_to_rgb_row_rejects_short_output() { let src = [0u16; 3]; let mut out = [0u8; 2]; // needs 3 - rgb48_to_rgb_row::(&src, &mut out, 1, false); + rgb48_to_rgb_row_endian::(&src, &mut out, 1, false); } #[test] @@ -1536,7 +1932,7 @@ mod tests { fn rgba64_to_rgb_row_rejects_short_input() { let src = [0u16; 3]; // needs 4 for width=1 let mut out = [0u8; 3]; - rgba64_to_rgb_row::(&src, &mut out, 1, false); + rgba64_to_rgb_row_endian::(&src, &mut out, 1, false); } #[test] @@ -1544,7 +1940,7 @@ mod tests { fn rgba64_to_rgba_row_rejects_short_output() { let src = [0u16; 4]; let mut out = [0u8; 3]; // needs 4 - rgba64_to_rgba_row::(&src, &mut out, 1, false); + rgba64_to_rgba_row_endian::(&src, &mut out, 1, false); } #[test] @@ -1553,7 +1949,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 3]; let mut luma: [u8; 0] = []; - rgb48_to_luma_row::( + rgb48_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1570,7 +1966,7 @@ mod tests { let src = [0u16; 3]; let mut scratch = [0u8; 2]; // needs 3 let mut luma = [0u8; 1]; - rgb48_to_luma_row::( + rgb48_to_luma_row_endian::( &src, &mut luma, &mut scratch, @@ -1601,7 +1997,7 @@ mod tests { fn rgb48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgb48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + rgb48_to_rgb_row_endian::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1610,7 +2006,7 @@ mod tests { fn bgr48_dispatcher_rejects_width_times_3_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgr48_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); + 
bgr48_to_rgb_row_endian::(&p, &mut out, OVERFLOW_WIDTH_TIMES_3, false); } #[cfg(target_pointer_width = "32")] @@ -1619,7 +2015,7 @@ mod tests { fn rgba64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - rgba64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + rgba64_to_rgb_row_endian::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } #[cfg(target_pointer_width = "32")] @@ -1628,6 +2024,6 @@ mod tests { fn bgra64_dispatcher_rejects_width_times_4_overflow() { let p: [u16; 0] = []; let mut out: [u8; 0] = []; - bgra64_to_rgb_row::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); + bgra64_to_rgb_row_endian::(&p, &mut out, OVERFLOW_WIDTH_TIMES_4, false); } } diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index eedf8b2a..f2c1dd89 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -948,7 +948,7 @@ pub fn bgrx_to_rgba_row(bgrx: &[u8], rgba_out: &mut [u8], width: usize, use_simd /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_row( +pub fn x2rgb10_to_rgb_row_endian( x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, @@ -993,12 +993,20 @@ pub fn x2rgb10_to_rgb_row( scalar::x2rgb10_to_rgb_row::(x2rgb10, rgb_out, width); } +/// LE-only wrapper around [`x2rgb10_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `x2rgb10_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2rgb10_to_rgb_row(x2rgb10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { + x2rgb10_to_rgb_row_endian::(x2rgb10, rgb_out, width, use_simd) +} + /// Drops the 2-bit padding, down-shifts to 8 bits, and forces alpha /// to `0xFF` from `X2RGB10` LE input. Output: packed `R, G, B, A`. /// /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgba_row( +pub fn x2rgb10_to_rgba_row_endian( x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, @@ -1042,13 +1050,21 @@ pub fn x2rgb10_to_rgba_row( scalar::x2rgb10_to_rgba_row::(x2rgb10, rgba_out, width); } +/// LE-only wrapper around [`x2rgb10_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `x2rgb10_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2rgb10_to_rgba_row(x2rgb10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { + x2rgb10_to_rgba_row_endian::(x2rgb10, rgba_out, width, use_simd) +} + /// Extracts each 10-bit channel into native-depth `u16` (low-bit /// aligned, max value `1023`) from `X2RGB10` LE input. Output: /// packed `R, G, B` `u16` elements. /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2rgb10_to_rgb_u16_row( +pub fn x2rgb10_to_rgb_u16_row_endian( x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, @@ -1095,11 +1111,19 @@ pub fn x2rgb10_to_rgb_u16_row( scalar::x2rgb10_to_rgb_u16_row::(x2rgb10, rgb_out, width); } +/// LE-only wrapper around [`x2rgb10_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `x2rgb10_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2rgb10_to_rgb_u16_row(x2rgb10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { + x2rgb10_to_rgb_u16_row_endian::(x2rgb10, rgb_out, width, use_simd) +} + /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_row`]. Channel /// positions in the source `u32` are reversed; output is still /// `R, G, B`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_row( +pub fn x2bgr10_to_rgb_row_endian( x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, @@ -1144,9 +1168,17 @@ pub fn x2bgr10_to_rgb_row( scalar::x2bgr10_to_rgb_row::(x2bgr10, rgb_out, width); } +/// LE-only wrapper around [`x2bgr10_to_rgb_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `x2bgr10_to_rgb_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2bgr10_to_rgb_row(x2bgr10: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { + x2bgr10_to_rgb_row_endian::(x2bgr10, rgb_out, width, use_simd) +} + /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgba_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgba_row( +pub fn x2bgr10_to_rgba_row_endian( x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, @@ -1190,9 +1222,17 @@ pub fn x2bgr10_to_rgba_row( scalar::x2bgr10_to_rgba_row::(x2bgr10, rgba_out, width); } +/// LE-only wrapper around [`x2bgr10_to_rgba_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. Equivalent to `x2bgr10_to_rgba_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2bgr10_to_rgba_row(x2bgr10: &[u8], rgba_out: &mut [u8], width: usize, use_simd: bool) { + x2bgr10_to_rgba_row_endian::(x2bgr10, rgba_out, width, use_simd) +} + /// `X2BGR10` LE counterpart of [`x2rgb10_to_rgb_u16_row`]. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn x2bgr10_to_rgb_u16_row( +pub fn x2bgr10_to_rgb_u16_row_endian( x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, @@ -1237,3 +1277,11 @@ pub fn x2bgr10_to_rgb_u16_row( } scalar::x2bgr10_to_rgb_u16_row::(x2bgr10, rgb_out, width); } + +/// LE-only wrapper around [`x2bgr10_to_rgb_u16_row_endian`]; preserves the pre-endian- +/// generic public signature so existing little-endian callers compile +/// unchanged. 
Equivalent to `x2bgr10_to_rgb_u16_row_endian::(...)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn x2bgr10_to_rgb_u16_row(x2bgr10: &[u8], rgb_out: &mut [u16], width: usize, use_simd: bool) { + x2bgr10_to_rgb_u16_row_endian::(x2bgr10, rgb_out, width, use_simd) +} diff --git a/src/row/mod.rs b/src/row/mod.rs index 714f0e4c..b2502de8 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -91,18 +91,34 @@ pub(crate) use dispatch::mono1bit::*; // parameter) are re-exported as `pub(crate)` for sinker use — the underlying // functions in `dispatch::packed_rgb_16bit` are `pub`, but only this // re-export visibility is visible outside the crate. +// +// Each function exists in two forms: +// - `foo` — backwards-compatible LE-only wrapper (no const generic), preserves +// the pre-Tier 8 public signature so existing little-endian downstream +// callers compile unchanged. +// - `foo_endian::` — endian-aware form (added in Tier 8 for +// the BE-on-BE-host plane contract). Used by sinker code internally. 
pub use dispatch::packed_rgb_16bit::{ - bgr48_to_rgb_row, bgr48_to_rgb_u16_row, bgr48_to_rgba_row, bgr48_to_rgba_u16_row, - bgra64_to_rgb_row, bgra64_to_rgb_u16_row, bgra64_to_rgba_row, bgra64_to_rgba_u16_row, - rgb48_to_rgb_row, rgb48_to_rgb_u16_row, rgb48_to_rgba_row, rgb48_to_rgba_u16_row, - rgba64_to_rgb_row, rgba64_to_rgb_u16_row, rgba64_to_rgba_row, rgba64_to_rgba_u16_row, + bgr48_to_rgb_row, bgr48_to_rgb_row_endian, bgr48_to_rgb_u16_row, bgr48_to_rgb_u16_row_endian, + bgr48_to_rgba_row, bgr48_to_rgba_row_endian, bgr48_to_rgba_u16_row, bgr48_to_rgba_u16_row_endian, + bgra64_to_rgb_row, bgra64_to_rgb_row_endian, bgra64_to_rgb_u16_row, bgra64_to_rgb_u16_row_endian, + bgra64_to_rgba_row, bgra64_to_rgba_row_endian, bgra64_to_rgba_u16_row, + bgra64_to_rgba_u16_row_endian, rgb48_to_rgb_row, rgb48_to_rgb_row_endian, rgb48_to_rgb_u16_row, + rgb48_to_rgb_u16_row_endian, rgb48_to_rgba_row, rgb48_to_rgba_row_endian, rgb48_to_rgba_u16_row, + rgb48_to_rgba_u16_row_endian, rgba64_to_rgb_row, rgba64_to_rgb_row_endian, rgba64_to_rgb_u16_row, + rgba64_to_rgb_u16_row_endian, rgba64_to_rgba_row, rgba64_to_rgba_row_endian, + rgba64_to_rgba_u16_row, rgba64_to_rgba_u16_row_endian, }; // luma + HSV variants take an extra rgb_scratch parameter — sinker wired in Task 9. 
#[allow(unused_imports)] pub(crate) use dispatch::packed_rgb_16bit::{ - bgr48_to_hsv_row, bgr48_to_luma_row, bgr48_to_luma_u16_row, bgra64_to_hsv_row, - bgra64_to_luma_row, bgra64_to_luma_u16_row, rgb48_to_hsv_row, rgb48_to_luma_row, - rgb48_to_luma_u16_row, rgba64_to_hsv_row, rgba64_to_luma_row, rgba64_to_luma_u16_row, + bgr48_to_hsv_row, bgr48_to_hsv_row_endian, bgr48_to_luma_row, bgr48_to_luma_row_endian, + bgr48_to_luma_u16_row, bgr48_to_luma_u16_row_endian, bgra64_to_hsv_row, bgra64_to_hsv_row_endian, + bgra64_to_luma_row, bgra64_to_luma_row_endian, bgra64_to_luma_u16_row, + bgra64_to_luma_u16_row_endian, rgb48_to_hsv_row, rgb48_to_hsv_row_endian, rgb48_to_luma_row, + rgb48_to_luma_row_endian, rgb48_to_luma_u16_row, rgb48_to_luma_u16_row_endian, rgba64_to_hsv_row, + rgba64_to_hsv_row_endian, rgba64_to_luma_row, rgba64_to_luma_row_endian, rgba64_to_luma_u16_row, + rgba64_to_luma_u16_row_endian, }; // Gray dispatchers are pub(crate) — sinker code uses them via crate::row::gray*_row. #[cfg(any(feature = "std", feature = "alloc"))] diff --git a/src/sinker/mixed/packed_rgb_10bit.rs b/src/sinker/mixed/packed_rgb_10bit.rs index ea0a62f5..2e171c2c 100644 --- a/src/sinker/mixed/packed_rgb_10bit.rs +++ b/src/sinker/mixed/packed_rgb_10bit.rs @@ -32,8 +32,9 @@ use super::{ use crate::{ PixelSink, row::{ - rgb_to_hsv_row, rgb_to_luma_row, x2bgr10_to_rgb_row, x2bgr10_to_rgb_u16_row, - x2bgr10_to_rgba_row, x2rgb10_to_rgb_row, x2rgb10_to_rgb_u16_row, x2rgb10_to_rgba_row, + rgb_to_hsv_row, rgb_to_luma_row, x2bgr10_to_rgb_row_endian, x2bgr10_to_rgb_u16_row_endian, + x2bgr10_to_rgba_row_endian, x2rgb10_to_rgb_row_endian, x2rgb10_to_rgb_u16_row_endian, + x2rgb10_to_rgba_row_endian, }, yuv::{X2Bgr10, X2Bgr10Row, X2Bgr10Sink, X2Rgb10, X2Rgb10Row, X2Rgb10Sink}, }; @@ -149,7 +150,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { w, h, )?; - x2rgb10_to_rgb_row::(x2rgb10_in, rgb_row, w, use_simd); + x2rgb10_to_rgb_row_endian::(x2rgb10_in, rgb_row, w, use_simd); if let Some(luma) = 
luma.as_deref_mut() { rgb_to_luma_row( @@ -177,7 +178,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { // u8 RGBA output (single-pass, dedicated kernel forces alpha). if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2rgb10_to_rgba_row::(x2rgb10_in, rgba_row, w, use_simd); + x2rgb10_to_rgba_row_endian::(x2rgb10_in, rgba_row, w, use_simd); } // u16 native RGB output (10-bit precision preserved). @@ -193,7 +194,7 @@ impl PixelSink for MixedSinker<'_, X2Rgb10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2rgb10_to_rgb_u16_row::(x2rgb10_in, rgb_u16_row, w, use_simd); + x2rgb10_to_rgb_u16_row_endian::(x2rgb10_in, rgb_u16_row, w, use_simd); } Ok(()) @@ -307,7 +308,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { w, h, )?; - x2bgr10_to_rgb_row::(x2bgr10_in, rgb_row, w, use_simd); + x2bgr10_to_rgb_row_endian::(x2bgr10_in, rgb_row, w, use_simd); if let Some(luma) = luma.as_deref_mut() { rgb_to_luma_row( @@ -334,7 +335,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, one_plane_start, one_plane_end, w, h)?; - x2bgr10_to_rgba_row::(x2bgr10_in, rgba_row, w, use_simd); + x2bgr10_to_rgba_row_endian::(x2bgr10_in, rgba_row, w, use_simd); } if want_rgb_u16 { @@ -349,7 +350,7 @@ impl PixelSink for MixedSinker<'_, X2Bgr10> { })?; let rgb_plane_start = one_plane_start * 3; let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end]; - x2bgr10_to_rgb_u16_row::(x2bgr10_in, rgb_u16_row, w, use_simd); + x2bgr10_to_rgb_u16_row_endian::(x2bgr10_in, rgb_u16_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/packed_rgb_16bit.rs b/src/sinker/mixed/packed_rgb_16bit.rs index e7ad27cd..1fa4a023 100644 --- a/src/sinker/mixed/packed_rgb_16bit.rs +++ b/src/sinker/mixed/packed_rgb_16bit.rs @@ -35,12 +35,13 @@ use super::{ use crate::{ PixelSink, row::{ 
- bgr48_to_rgb_row, bgr48_to_rgb_u16_row, bgr48_to_rgba_row, bgr48_to_rgba_u16_row, - bgra64_to_rgb_row, bgra64_to_rgb_u16_row, bgra64_to_rgba_row, bgra64_to_rgba_u16_row, - expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, rgb_to_luma_row, - rgb_to_luma_u16_row, rgb48_to_rgb_row, rgb48_to_rgb_u16_row, rgb48_to_rgba_row, - rgb48_to_rgba_u16_row, rgba64_to_rgb_row, rgba64_to_rgb_u16_row, rgba64_to_rgba_row, - rgba64_to_rgba_u16_row, + bgr48_to_rgb_row_endian, bgr48_to_rgb_u16_row_endian, bgr48_to_rgba_row_endian, + bgr48_to_rgba_u16_row_endian, bgra64_to_rgb_row_endian, bgra64_to_rgb_u16_row_endian, + bgra64_to_rgba_row_endian, bgra64_to_rgba_u16_row_endian, expand_rgb_to_rgba_row, + expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, rgb_to_luma_row, rgb_to_luma_u16_row, + rgb48_to_rgb_row_endian, rgb48_to_rgb_u16_row_endian, rgb48_to_rgba_row_endian, + rgb48_to_rgba_u16_row_endian, rgba64_to_rgb_row_endian, rgba64_to_rgb_u16_row_endian, + rgba64_to_rgba_row_endian, rgba64_to_rgba_u16_row_endian, }, yuv::{ Bgr48, Bgr48Row, Bgr48Sink, Bgra64, Bgra64Row, Bgra64Sink, Rgb48, Rgb48Row, Rgb48Sink, Rgba64, @@ -206,7 +207,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> { // with_luma_u16, or with_hsv is attached. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgb48_to_rgb_row::(in48, rgb_row, w, use_simd); + rgb48_to_rgb_row_endian::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -245,7 +246,7 @@ impl PixelSink for MixedSinker<'_, Rgb48> { // u8 RGBA — single-pass kernel, alpha forced to 0xFF. if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_row::(in48, rgba_row, w, use_simd); + rgb48_to_rgba_row_endian::(in48, rgba_row, w, use_simd); } // u16 RGB — native passthrough. 
@@ -257,13 +258,13 @@ impl PixelSink for MixedSinker<'_, Rgb48> { height: h, channels: 3, })?; - rgb48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); + rgb48_to_rgb_u16_row_endian::(in48, &mut buf[ps * 3..end], w, use_simd); } // u16 RGBA — native passthrough, alpha forced to 0xFFFF. if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - rgb48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); + rgb48_to_rgba_u16_row_endian::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -426,7 +427,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgr48_to_rgb_row::(in48, rgb_row, w, use_simd); + bgr48_to_rgb_row_endian::(in48, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -464,7 +465,7 @@ impl PixelSink for MixedSinker<'_, Bgr48> { if let Some(buf) = rgba.as_deref_mut() { let rgba_row = rgba_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_row::(in48, rgba_row, w, use_simd); + bgr48_to_rgba_row_endian::(in48, rgba_row, w, use_simd); } if let Some(buf) = rgb_u16.as_deref_mut() { @@ -475,12 +476,12 @@ impl PixelSink for MixedSinker<'_, Bgr48> { height: h, channels: 3, })?; - bgr48_to_rgb_u16_row::(in48, &mut buf[ps * 3..end], w, use_simd); + bgr48_to_rgb_u16_row_endian::(in48, &mut buf[ps * 3..end], w, use_simd); } if let Some(buf) = rgba_u16.as_deref_mut() { let rgba_u16_row = rgba_u16_plane_row_slice(buf, ps, pe, w, h)?; - bgr48_to_rgba_u16_row::(in48, rgba_u16_row, w, use_simd); + bgr48_to_rgba_u16_row_endian::(in48, rgba_u16_row, w, use_simd); } Ok(()) @@ -667,7 +668,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); + 
rgba64_to_rgba_row_endian::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -675,7 +676,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row_endian::(in64, rgba_u16_row, w, use_simd); return Ok(()); } @@ -683,7 +684,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { // and Strategy A+ RGBA fan-out. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - rgba64_to_rgb_row::(in64, rgb_row, w, use_simd); + rgba64_to_rgb_row_endian::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -739,7 +740,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - rgba64_to_rgba_row::(in64, rgba_row, w, use_simd); + rgba64_to_rgba_row_endian::(in64, rgba_row, w, use_simd); } // ===== u16 path ===== @@ -754,7 +755,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - rgba64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); + rgba64_to_rgb_u16_row_endian::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached — derive from the // just-computed u16 RGB row (writes α=0xFFFF), then overwrite α @@ -778,7 +779,7 @@ impl PixelSink for MixedSinker<'_, Rgba64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - rgba64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); + rgba64_to_rgba_u16_row_endian::(in64, rgba_u16_row, w, use_simd); } Ok(()) @@ -950,7 +951,7 @@ impl PixelSink for MixedSinker<'_, 
Bgra64> { if want_rgba && !need_u8_rgb && !want_rgb_u16 && !want_rgba_u16 { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row_endian::(in64, rgba_row, w, use_simd); return Ok(()); } @@ -958,14 +959,14 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 && !need_u8_rgb && !want_rgba { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); + bgra64_to_rgba_u16_row_endian::(in64, rgba_u16_row, w, use_simd); return Ok(()); } // u8 RGB staging path. if need_u8_rgb { let rgb_row = rgb_row_buf_or_scratch(rgb.as_deref_mut(), rgb_scratch, ps, pe, w, h)?; - bgra64_to_rgb_row::(in64, rgb_row, w, use_simd); + bgra64_to_rgb_row_endian::(in64, rgb_row, w, use_simd); if let Some(luma_buf) = luma.as_deref_mut() { rgb_to_luma_row( @@ -1017,7 +1018,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba && !need_u8_rgb { let rgba_buf = rgba.as_deref_mut().unwrap(); let rgba_row = rgba_plane_row_slice(rgba_buf, ps, pe, w, h)?; - bgra64_to_rgba_row::(in64, rgba_row, w, use_simd); + bgra64_to_rgba_row_endian::(in64, rgba_row, w, use_simd); } // u16 RGB path. @@ -1031,7 +1032,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { channels: 3, })?; let rgb_u16_row = &mut rgb_u16_buf[ps * 3..end]; - bgra64_to_rgb_u16_row::(in64, rgb_u16_row, w, use_simd); + bgra64_to_rgb_u16_row_endian::(in64, rgb_u16_row, w, use_simd); // Strategy A+ u16: RGBA u16 also attached. 
if want_rgba_u16 { @@ -1052,7 +1053,7 @@ impl PixelSink for MixedSinker<'_, Bgra64> { if want_rgba_u16 && !want_rgb_u16 { let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap(); let rgba_u16_row = rgba_u16_plane_row_slice(rgba_u16_buf, ps, pe, w, h)?; - bgra64_to_rgba_u16_row::(in64, rgba_u16_row, w, use_simd); + bgra64_to_rgba_u16_row_endian::(in64, rgba_u16_row, w, use_simd); } Ok(()) diff --git a/src/sinker/mixed/tests/packed_rgb_16bit.rs b/src/sinker/mixed/tests/packed_rgb_16bit.rs index 17d506d4..eb508f5f 100644 --- a/src/sinker/mixed/tests/packed_rgb_16bit.rs +++ b/src/sinker/mixed/tests/packed_rgb_16bit.rs @@ -417,3 +417,64 @@ fn rgb48_multi_row_frame() { assert_eq!(out[10], 0xFF); assert_eq!(out[11], 0xFF); } + +// ---- BE-contract regression ----------------------------------------------- + +/// Rgb48 sinker LE-encoded plane decodes correctly on every host. +/// +/// The frame doc-comment contract (see `src/frame/packed_rgb_16bit.rs`) says +/// the `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +/// `u16` (matching FFmpeg's `*LE` suffix). On a little-endian host LE bytes +/// are host-native — identity. On a big-endian host the bytes are swapped +/// relative to host-native, so the kernel must apply `u16::from_le` (kernel +/// generic `BE = false`) to recover the host-native sample before arithmetic. +/// +/// This test builds the plane from LE-encoded u16 patterns +/// (`intended.to_le()` on each sample) and asserts the sinker output matches +/// the host-native `intended` values bit-exact via the `with_rgb_u16` +/// (identity) path. On a BE host with a regressed pre-swap (caller swaps, +/// kernel swaps again → double swap) this would corrupt every sample. +/// +/// Forces `with_simd(false)` so this test runs purely scalar — no SIMD +/// intrinsics — which lets it execute under `cargo miri test`. BE CI is +/// driven by miri on s390x / powerpc64; gating it out of miri would skip +/// exactly the host where BE corruption would surface. 
+/// +/// Mirrors the `rgbf32_sinker_le_encoded_frame_decodes_correctly` pattern +/// added in PR #92's `5b42065` / `3b1d716`. +#[test] +fn rgb48_sinker_le_encoded_frame_decodes_correctly() { + // Mix high / mid / low / asymmetric byte patterns so any byte-swap regression + // shows up as a non-trivial mismatch (not just a no-op pattern). + let intended: Vec<u16> = (0..16 * 4 * 3) + .map(|i| match i % 4 { + 0 => 0x1234, + 1 => 0xABCD, + 2 => 0x00FF, + _ => 0xFF00, + }) + .collect(); + // Construct the plane as LE-encoded u16 (the documented `*LE` Frame + // contract). On LE host this is identity; on BE host the bit-pattern is + // byte-swapped so the kernel must `from_le` it back to host-native. + let pix: Vec<u16> = intended.iter().map(|&v| v.to_le()).collect(); + let src = Rgb48Frame::try_new(&pix, 16, 4, 16 * 3).unwrap(); + + // `with_rgb_u16` is the identity passthrough — the cleanest probe of the + // endian contract because no narrowing or arithmetic obscures the bit + // pattern. A single mismatched sample byte-swap would be unmissable. + let mut rgb_u16_out = vec![0u16; 16 * 4 * 3]; + let mut sink = MixedSinker::<Rgb48>::new(16, 4) + .with_simd(false) + .with_rgb_u16(&mut rgb_u16_out) + .unwrap(); + rgb48_to(&src, true, ColorMatrix::Bt709, &mut sink).unwrap(); + + // Output must be host-native intended values. On a BE host with a + // regressed pre-swap (caller swaps, kernel swaps again) this would be + // byte-swapped relative to `intended`.
+ assert_eq!( + rgb_u16_out, intended, + "Rgb48 sinker LE-encoded plane decoded incorrectly (BE-contract regression)" + ); +} From 9b6521bc6c52ddc542770570b3c2b820f3f4e3d9 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 10:45:00 +1200 Subject: [PATCH 5/6] test(be-tier8): gate LE-fixture dispatcher tests on cfg(target_endian = "little") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI miri-sb / miri-tb on PR #87 fail on s390x and powerpc64 (BE host) because ~25 dispatcher smoke tests in `src/row/dispatch/{ayuv64, packed_rgb_16bit}.rs` build host-native `Vec<u16>` fixtures and call the new LE-only wrappers (`*_endian::<false>` / `be_input = false`). Those wrappers forward to the scalar kernel's `u16::from_le` load, which on a BE host swaps bytes and corrupts the fixture before the conversion math runs — so the test's host-native value is no longer what the kernel sees and the assertions break. Apply the same fix as PR #82 8f2e329, PR #83 56342c0, and PR #85 57d9064: gate each affected test with `#[cfg(target_endian = "little")]`. The LE wrapper is intentionally LE-only, and the dispatcher tests are smoke tests that simply route to the scalar fallback — not the right place to assert BE-host correctness. Tests that use only byte-symmetric values (`0x0000`, `0xFFFF`, `0x1111`, `0x2222`, `0x3333`, ...) or that discard the only non-symmetric u16 (e.g. an alpha that the RGB output drops before observation) are host-endian-invariant and left ungated. That covers `rgb48_dispatcher_to_rgb_scalar_path`, `bgr48_dispatcher_to_rgb_u16_scalar_path`, `bgr48_dispatcher_to_rgba_u16_scalar_path`, `rgba64_dispatcher_to_rgb_u16_scalar_path`, and `bgra64_dispatcher_to_rgb_u16_scalar_path` (5 of the 28 packed-RGB dispatcher tests stay open).
BE-host correctness of the underlying scalar / SIMD kernels is already validated by the per-arch BE parity tests added in tier 5–7, which build fixtures via `to_le_bytes` / `to_be_bytes` (see PR #86's b7fb9d3 and 6924907) and so are host-endian-invariant by construction. Tests gated: - `src/row/dispatch/ayuv64.rs`: `ayuv64_dispatchers_route_with_simd_false`, `ayuv64_be_and_le_dispatchers_agree` (2 tests). - `src/row/dispatch/packed_rgb_16bit.rs`: 23 of the 28 `*_dispatcher_to_*_scalar_path` smoke tests (the 5 listed above remain ungated). Verification: - `cargo test --target aarch64-apple-darwin --lib` — 2301 passed. - `cargo build --target x86_64-apple-darwin --tests` — clean. - `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` — clean. - `cargo build --no-default-features` — clean. - `cargo fmt --check` — clean. - `cargo clippy --all-targets --all-features -- -D warnings` — clean. - `cargo check --target s390x-unknown-linux-gnu --lib` — clean (BE-host smoke). - `cargo clippy --target s390x-unknown-linux-gnu --lib --all-features` — clean (no dead-code warnings from the gates). No kernel code touched (correctness already approved on codex pass 3); no `#[allow(...)]` suppressions; no `--no-verify`. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/ayuv64.rs | 16 +++++++++++++ src/row/dispatch/packed_rgb_16bit.rs | 36 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) diff --git a/src/row/dispatch/ayuv64.rs b/src/row/dispatch/ayuv64.rs index 0d757ee8..5aa934a2 100644 --- a/src/row/dispatch/ayuv64.rs +++ b/src/row/dispatch/ayuv64.rs @@ -672,6 +672,13 @@ mod tests { // ---- functional smoke --------------------------------------------------- + // LE-host gate: this test builds host-native `Vec<u16>` fixtures and calls + // the dispatchers with `be_input = false`, which forwards to the scalar + // kernel's `from_le` load.
On BE hosts (s390x / powerpc64) `from_le` swaps + // bytes, so the host-native fixture is corrupted before the math runs and + // the assertions break. BE-host correctness is covered by the per-arch BE + // parity tests that build fixtures via `to_le_bytes` / `to_be_bytes`. + #[cfg(target_endian = "little")] #[test] fn ayuv64_dispatchers_route_with_simd_false() { // Limited-range BT.709: Y=60160 = 235*256 is limited-range white; @@ -761,6 +768,15 @@ mod tests { } } + // LE-host gate: the LE side uses `solid_ayuv64` (host-native) with + // `be_input = false` (→ `from_le`); the BE side uses `pack_ayuv64_be` + // (`swap_bytes` of host-native) with `be_input = true` (→ `from_be`). + // Both encodings are LE-host-correct only — on BE host the byte order in + // memory does not match what the wrappers decode, so the test must be + // pinned to little-endian. Cross-endian agreement on BE host is verified + // by the per-arch BE parity tests that construct fixtures via + // `to_le_bytes` / `to_be_bytes`. + #[cfg(target_endian = "little")] #[test] fn ayuv64_be_and_le_dispatchers_agree() { // BE-encoded data decoded with be_input=true must produce the same diff --git a/src/row/dispatch/packed_rgb_16bit.rs b/src/row/dispatch/packed_rgb_16bit.rs index d3944dbf..8dcdc8b6 100644 --- a/src/row/dispatch/packed_rgb_16bit.rs +++ b/src/row/dispatch/packed_rgb_16bit.rs @@ -1490,6 +1490,19 @@ mod tests { //! //! Each dispatcher's scalar fallback is exercised via `use_simd = false`. //! Overflow-guard tests are gated on 32-bit targets where `usize` is 32 bits. + //! + //! Many tests in this module build host-native `Vec<u16>` fixtures and + //! call the LE-only `*_endian::<false>` wrappers, which apply + //! `u16::from_le` to each element. On big-endian hosts (s390x / + //! powerpc64) `from_le` swaps bytes, corrupting the fixture before the + //! conversion math runs. Such tests are gated with + //! `#[cfg(target_endian = "little")]`. Tests that use only + //!
byte-symmetric values (`0x0000`, `0xFFFF`, `0x1111`, `0x2222`, + //! `0x3333`, ...) or that discard the only non-symmetric u16 (e.g. an + //! alpha that is dropped on RGB output) are host-endian-invariant and + //! left ungated. BE-host correctness of the underlying kernels is + //! covered by the per-arch BE parity tests that construct fixtures via + //! `to_le_bytes` / `to_be_bytes`. use super::*; // ---- helpers ------------------------------------------------------------- @@ -1518,6 +1531,7 @@ mod tests { ); } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_rgba_scalar_path() { let src = solid_rgb48(4, 0x1200); @@ -1529,6 +1543,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_rgb_u16_scalar_path() { let src = solid_rgb48(4, 0xABCD); @@ -1540,6 +1555,7 @@ mod tests { ); } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_rgba_u16_scalar_path() { let src = solid_rgb48(4, 0x1234); @@ -1551,6 +1567,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_luma_scalar_path() { // All-white Rgb48 (all channels = 0xFF00) → near-white luma in full-range BT.709. @@ -1571,6 +1588,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_luma_u16_scalar_path() { let src = solid_rgb48(4, 0xFF00); @@ -1593,6 +1611,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgb48_dispatcher_to_hsv_scalar_path() { // Pure red: R=0xFF00, G=0, B=0 → H=0, S=255, V≈255 in OpenCV encoding. @@ -1609,6 +1628,7 @@ mod tests { // ---- Bgr48 --------------------------------------------------------------- + #[cfg(target_endian = "little")] #[test] fn bgr48_dispatcher_to_rgb_scalar_path() { // Bgr48 pixel [B=0x1100, G=0x2200, R=0x3300] → rgb [R=0x33, G=0x22, B=0x11]. 
@@ -1620,6 +1640,7 @@ mod tests { assert_eq!(rgb[2], 0x11, "B"); } + #[cfg(target_endian = "little")] #[test] fn bgr48_dispatcher_to_rgba_scalar_path() { let src = [0x1100u16, 0x2200, 0x3300]; @@ -1648,6 +1669,7 @@ mod tests { assert_eq!(rgba_u16[3], 0xFFFF, "alpha forced to 0xFFFF"); } + #[cfg(target_endian = "little")] #[test] fn bgr48_dispatcher_to_luma_scalar_path() { let src = solid_rgb48(4, 0xFF00); // all channels = 0xFF00 @@ -1667,6 +1689,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn bgr48_dispatcher_to_luma_u16_scalar_path() { let src = solid_rgb48(4, 0xFF00); @@ -1686,6 +1709,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn bgr48_dispatcher_to_hsv_scalar_path() { // Pure blue in Bgr48 layout: B=0xFF00, G=0, R=0. @@ -1707,6 +1731,7 @@ mod tests { // ---- Rgba64 -------------------------------------------------------------- + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_rgb_scalar_path() { // Source alpha should be dropped; R/G/B narrowed. @@ -1718,6 +1743,7 @@ mod tests { assert_eq!(rgb[2], 0x33, "B"); } + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_rgba_scalar_path() { // Source alpha 0xABCD → 0xAB after >> 8. @@ -1737,6 +1763,7 @@ mod tests { assert_eq!(rgb_u16[2], 0x3333, "B"); } + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_rgba_u16_scalar_path() { // Identity copy; source alpha preserved. @@ -1747,6 +1774,7 @@ mod tests { assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_luma_scalar_path() { // All-white Rgba64 (alpha irrelevant for luma path). 
@@ -1767,6 +1795,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_luma_u16_scalar_path() { let src = solid_rgba64(4, 0xFF00); @@ -1789,6 +1818,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn rgba64_dispatcher_to_hsv_scalar_path() { // Pure green Rgba64: R=0, G=0xFF00, B=0, A=anything → H=60, S=255, V≈255. @@ -1809,6 +1839,7 @@ mod tests { // ---- Bgra64 -------------------------------------------------------------- + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_rgb_scalar_path() { // Bgra64: B=0x1100, G=0x2200, R=0x3300, A=0xDEAD → RGB [R=0x33, G=0x22, B=0x11]. @@ -1820,6 +1851,7 @@ mod tests { assert_eq!(rgb[2], 0x11, "B"); } + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_rgba_scalar_path() { // Source alpha 0xABCD → 0xAB after >> 8; channels swapped. @@ -1840,6 +1872,7 @@ mod tests { assert_eq!(rgb_u16[2], 0x1111, "B (from position 0)"); } + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_rgba_u16_scalar_path() { let src = [0x1111u16, 0x2222, 0x3333, 0xABCD]; // B, G, R, A @@ -1849,6 +1882,7 @@ mod tests { assert_eq!(rgba_u16[3], 0xABCD, "source alpha preserved"); } + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_luma_scalar_path() { let src = solid_rgba64(4, 0xFF00); @@ -1868,6 +1902,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_luma_u16_scalar_path() { let src = solid_rgba64(4, 0xFF00); @@ -1890,6 +1925,7 @@ mod tests { } } + #[cfg(target_endian = "little")] #[test] fn bgra64_dispatcher_to_hsv_scalar_path() { // Pure blue in Bgra64 layout: B=0xFF00, G=0, R=0, A=any. 
From cb53e8676441d6adb9bb6fc63ecb6c0ce6191769 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 10:57:47 +1200 Subject: [PATCH 6/6] test(be-tier8): gate AYUV64 dispatcher test helpers on cfg(target_endian = "little") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI miri-tb-s390x failed with 4 dead-code errors in `src/row/dispatch/ayuv64.rs`: error: function `pack_ayuv64` is never used error: function `pack_ayuv64_be` is never used error: function `solid_ayuv64` is never used error: function `solid_ayuv64_be` is never used These helpers are consumed only by the AYUV64 dispatcher tests gated in `9b6521b` on `target_endian = "little"`. On BE hosts the gated tests are skipped, so the helpers fall out as dead code → `-D warnings` makes the miri job fail to compile. Fix: gate the 4 helpers with the same `cfg(target_endian = "little")` so they're co-compiled with their consumers. Verified: - cargo check --target s390x-unknown-linux-gnu --lib: clean - cargo test --target aarch64-apple-darwin --lib: 2301 pass - cargo fmt --check: clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/ayuv64.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/row/dispatch/ayuv64.rs b/src/row/dispatch/ayuv64.rs index 5aa934a2..c598dc67 100644 --- a/src/row/dispatch/ayuv64.rs +++ b/src/row/dispatch/ayuv64.rs @@ -567,11 +567,19 @@ mod tests { /// Pack one AYUV64 pixel from explicit A / Y / U / V samples (16-bit /// native, no shift required). + /// + /// Helpers below are consumed only by the LE-host-gated tests in this + /// module (see the gating policy at the top of `mod tests`); on BE + /// hosts (s390x / powerpc64) those tests are skipped, so the helpers + /// would appear unused under `-D warnings`. Gate the helpers with the + /// same `target_endian = "little"` cfg. 
+ #[cfg(target_endian = "little")] fn pack_ayuv64(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] { [a, y, u, v] } /// Pack one AYUV64 pixel in big-endian wire format. + #[cfg(target_endian = "little")] fn pack_ayuv64_be(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] { [ a.swap_bytes(), @@ -584,12 +592,14 @@ mod tests { /// Build a `Vec` AYUV64 row of `width` pixels with neutral /// chroma (U=V=32768) and the given Y / alpha values. Any positive /// width is valid (4:4:4, no chroma subsampling). + #[cfg(target_endian = "little")] fn solid_ayuv64(width: usize, y: u16, a: u16) -> std::vec::Vec { let quad = pack_ayuv64(a, y, 32768, 32768); (0..width).flat_map(|_| quad).collect() } /// Build a `Vec` AYUV64 row in big-endian wire format. + #[cfg(target_endian = "little")] fn solid_ayuv64_be(width: usize, y: u16, a: u16) -> std::vec::Vec { let quad = pack_ayuv64_be(a, y, 32768, 32768); (0..width).flat_map(|_| quad).collect()