diff --git a/src/row/arch/neon/tests/v210.rs b/src/row/arch/neon/tests/v210.rs index b82bdab4..be979537 100644 --- a/src/row/arch/neon/tests/v210.rs +++ b/src/row/arch/neon/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v210→luma u16 diverges (width={width})"); } @@ -213,7 +213,7 @@ fn neon_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "neon v210 luma reorder bug"); @@ -222,9 +222,15 @@ fn neon_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/neon/tests/y216.rs b/src/row/arch/neon/tests/y216.rs index 8d379a2d..1a982f4d 100644 --- a/src/row/arch/neon/tests/y216.rs +++ b/src/row/arch/neon/tests/y216.rs @@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -32,9 +32,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -48,9 +48,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y216→luma diverges (width={width})"); } @@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y216→luma u16 diverges (width={width})"); } @@ -142,7 +142,7 @@ fn neon_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "NEON y216 luma_u16 reorder bug"); @@ -151,9 +151,15 @@ fn neon_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/neon/tests/y2xx.rs b/src/row/arch/neon/tests/y2xx.rs index 892e0d14..d12a51d4 100644 --- a/src/row/arch/neon/tests/y2xx.rs +++ b/src/row/arch/neon/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -95,9 +95,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -109,9 +109,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -123,9 +123,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -137,9 +139,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -151,9 +155,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y2xx<{BITS}>→luma diverges (width={width})"); } @@ -162,9 +166,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y2xx<{BITS}>→luma u16 diverges (width={width})"); } @@ -225,15 +229,15 @@ fn neon_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "NEON y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -241,7 +245,7 @@ fn neon_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -253,17 +257,17 @@ fn neon_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "NEON y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "NEON y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/neon/v210.rs b/src/row/arch/neon/v210.rs index 0d9f9748..ba406d7a 100644 --- a/src/row/arch/neon/v210.rs +++ b/src/row/arch/neon/v210.rs @@ -18,34 +18,9 @@ use core::arch::aarch64::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; -/// Loads 16 bytes as 4 × `u32` in **little-endian** order regardless -/// of host endianness. v210 words are documented LE; on big-endian -/// aarch64 (rare — `aarch64_be-*` custom targets) the plain -/// `vld1q_u32` would put bytes in reversed positions within each -/// lane and corrupt every subsequent shift-and-mask. Mirrors the -/// `x2_load_le_u32x4` helper in `packed_rgb.rs` (X2RGB10 / X2BGR10 -/// share the same LE-word constraint). Defining a local helper -/// avoids cross-file visibility hassle since `x2_load_le_u32x4` is -/// `pub(super) fn` but not re-exported via the mod's glob. -/// -/// # Safety -/// -/// Caller must ensure `ptr` has at least 16 bytes readable. -#[inline(always)] -unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t { - unsafe { - let raw = vld1q_u32(ptr as *const u32); - if cfg!(target_endian = "big") { - vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) - } else { - raw - } - } -} - /// Unpacks one 16-byte v210 word into three u16x8 vectors holding /// 10-bit samples in their low bits: /// - `y_vec`: lanes 0..6 = Y0..Y5 (lanes 6, 7 are don't-care). @@ -65,10 +40,12 @@ unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t { /// Caller must ensure `ptr` has at least 16 bytes readable. #[inline] #[target_feature(enable = "neon")] -unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint16x8_t) { +unsafe fn unpack_v210_word_neon( + ptr: *const u8, +) -> (uint16x8_t, uint16x8_t, uint16x8_t) { // SAFETY: caller obligation — `ptr` has 16 bytes readable. unsafe { - let words = v210_load_le_u32x4(ptr); + let words = load_endian_u32x4::(ptr); let mask10 = vdupq_n_u32(0x3FF); let low10 = vandq_u32(words, mask10); let mid10 = vandq_u32(vshrq_n_u32::<10>(words), mask10); @@ -132,12 +109,12 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint } } -/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA`: -/// `false` writes 3 bytes per pixel, `true` writes 4 bytes per -/// pixel with `α = 0xFF`. Output bit depth is u8 (downshifted from +/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA` and `BE`. +/// `BE = true` selects big-endian u32 word decoding (each 32-bit packed +/// word stored BE on the wire). Output bit depth is u8 (downshifted from /// the native 10-bit Q15 pipeline via `range_params_n::<10, 8>`). /// -/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::` for +/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::` for /// every input. /// /// # Safety @@ -148,7 +125,7 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -185,7 +162,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = vdupq_n_s32(coeffs.b_v()); for w in 0..full_words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); let y_i16 = vreinterpretq_s16_u16(y_vec); @@ -255,14 +232,21 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } /// NEON v210 → packed `u16` RGB / RGBA at native 10-bit depth -/// (low-bit-packed). Byte-identical to -/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::`. +/// (low-bit-packed). `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to +/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -272,7 +256,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -309,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = vdupq_n_s32(coeffs.b_v()); for w in 0..full_words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); let y_i16 = vreinterpretq_s16_u16(y_vec); let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); @@ -362,7 +346,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -375,7 +359,8 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// NEON v210 → 8-bit luma. Y values are downshifted from 10-bit to /// 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely. -/// Byte-identical to `scalar::v210_to_luma_row`. +/// `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to `scalar::v210_to_luma_row::`. /// /// # Safety /// @@ -385,7 +370,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let full_words = width / 6; @@ -395,7 +384,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..full_words { - let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8. let y_u8 = vqmovn_u16(vshrq_n_u16::<2>(y_vec)); // Store 6 of the 8 lanes: stack buffer + copy_from_slice. @@ -408,14 +397,15 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } /// NEON v210 → native-depth `u16` luma (low-bit-packed). Each output /// `u16` carries the source's 10-bit Y value in its low 10 bits. -/// Byte-identical to `scalar::v210_to_luma_u16_row`. +/// `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to `scalar::v210_to_luma_u16_row::`. /// /// # Safety /// @@ -425,7 +415,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let full_words = width / 6; @@ -435,7 +429,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..full_words { - let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; vst1q_u16(tmp.as_mut_ptr(), y_vec); @@ -446,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/neon/y216.rs b/src/row/arch/neon/y216.rs index 8aaa8664..01a26e62 100644 --- a/src/row/arch/neon/y216.rs +++ b/src/row/arch/neon/y216.rs @@ -32,8 +32,9 @@ use crate::{ColorMatrix, row::scalar}; // ---- u8 output (i32 chroma, 16 px/iter) --------------------------------- /// NEON Y216 → packed u8 RGB or RGBA. +/// `BE = true` bypasses NEON and uses scalar for the full row. /// -/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::`. /// /// # Safety /// @@ -43,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -61,128 +62,137 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = vdupq_n_s32(RND); - // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off. - // Y values are full u16 (0..65535), so we must use u16-aware widening - // rather than reinterpreting as i16 (which would corrupt values > 32767). - let y_off_v = vdupq_n_s32(y_off); - let y_scale_v = vdupq_n_s32(y_scale); - let c_scale_v = vdupq_n_s32(c_scale); - let bias_v = vdupq_n_s16(bias as i16); - let cru = vdupq_n_s32(coeffs.r_u()); - let crv = vdupq_n_s32(coeffs.r_v()); - let cgu = vdupq_n_s32(coeffs.g_u()); - let cgv = vdupq_n_s32(coeffs.g_v()); - let cbu = vdupq_n_s32(coeffs.b_u()); - let cbv = vdupq_n_s32(coeffs.b_v()); - + // BE=true: bypass NEON; scalar handles full row below. let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16). - // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - - // Extract U and V from interleaved chroma via vuzp. - // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3] - // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid. - // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid. - let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1); - let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1); - let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1); - let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1); - - // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so - // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16. - let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v); - let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v); - let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v); - let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v); - - // Widen to i32x4 for Q15 multiply. - // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs - // discarded by vzip1q_s16 below which only uses lanes 0..3). - let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16)); - let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16)); - let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16)); - let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16)); - let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16)); - let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16)); - let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16)); - let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16)); - - // Q15 chroma scale. - let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v)); - let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v)); - let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v)); - let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v)); - let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v)); - let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v)); - let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v)); - let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v)); - - // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi; - // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care). - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - - // Duplicate chroma into Y-pair slots (4:2:2): - // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3] - let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo); - let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo); - let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo); - let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi); - let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi); - let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi); - - // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies - // (y - y_off) * y_scale Q15, narrows to i16x8. Avoids the i16 - // overflow that `scale_y` would cause for Y values > 32767. - let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v); - - // Saturating add; narrow to u8x8. - let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo)); - let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo)); - let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo)); - let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi)); - let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi)); - let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi)); - - if ALPHA { - let alpha = vdup_n_u8(0xFF); - vst4_u8( - out.as_mut_ptr().add(x * 4), - uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha), - ); - vst4_u8( - out.as_mut_ptr().add(x * 4 + 32), - uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha), - ); - } else { - vst3_u8( - out.as_mut_ptr().add(x * 3), - uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8), - ); - vst3_u8( - out.as_mut_ptr().add(x * 3 + 24), - uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8), - ); + if !BE { + let rnd_v = vdupq_n_s32(RND); + // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off. + // Y values are full u16 (0..65535), so we must use u16-aware widening + // rather than reinterpreting as i16 (which would corrupt values > 32767). + let y_off_v = vdupq_n_s32(y_off); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 16 <= width { + // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16). + // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + + // Extract U and V from interleaved chroma via vuzp. + // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3] + // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid. + // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid. + let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1); + let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1); + let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1); + let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1); + + // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so + // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16. + let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v); + let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v); + let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v); + let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v); + + // Widen to i32x4 for Q15 multiply. + // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs + // discarded by vzip1q_s16 below which only uses lanes 0..3). + let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16)); + let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16)); + let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16)); + let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16)); + let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16)); + let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16)); + let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16)); + let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16)); + + // Q15 chroma scale. + let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v)); + let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v)); + let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v)); + let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v)); + let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v)); + let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v)); + let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v)); + let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v)); + + // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi; + // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care). + let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + + // Duplicate chroma into Y-pair slots (4:2:2): + // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3] + let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo); + let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo); + let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo); + let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi); + let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi); + let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi); + + // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies + // (y - y_off) * y_scale Q15, narrows to i16x8. Avoids the i16 + // overflow that `scale_y` would cause for Y values > 32767. + let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v); + + // Saturating add; narrow to u8x8. + let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo)); + let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo)); + let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo)); + let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi)); + let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi)); + let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi)); + + if ALPHA { + let alpha = vdup_n_u8(0xFF); + vst4_u8( + out.as_mut_ptr().add(x * 4), + uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha), + ); + vst4_u8( + out.as_mut_ptr().add(x * 4 + 32), + uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha), + ); + } else { + vst3_u8( + out.as_mut_ptr().add(x * 3), + uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8), + ); + vst3_u8( + out.as_mut_ptr().add(x * 3 + 24), + uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8), + ); + } + + x += 16; } + } // end if !BE - x += 16; - } - - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -192,7 +202,8 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// NEON Y216 → packed native-depth u16 RGB or RGBA. /// /// Uses i64 chroma (`chroma_i64x4`) to avoid overflow at 16-bit scales. -/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::`. +/// `BE = true` bypasses NEON and uses scalar for the full row. +/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::`. /// /// ## Pipeline /// @@ -211,7 +222,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -229,180 +240,183 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND: i32 = 1 << 14; unsafe { - let alpha_u16 = vdupq_n_u16(0xFFFF); - let rnd_v = vdupq_n_s32(RND); - let rnd64 = vdupq_n_s64(RND as i64); - let y_off_v = vdupq_n_s32(y_off); - let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32 - let c_scale_v = vdupq_n_s32(c_scale); - let bias_v = vdupq_n_s32(bias); - let cru = vdupq_n_s32(coeffs.r_u()); - let crv = vdupq_n_s32(coeffs.r_v()); - let cgu = vdupq_n_s32(coeffs.g_u()); - let cgv = vdupq_n_s32(coeffs.g_v()); - let cbu = vdupq_n_s32(coeffs.b_u()); - let cbv = vdupq_n_s32(coeffs.b_v()); - + // BE=true: bypass NEON; scalar handles full row below. let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - - // Extract U/V from chroma via vuzp. - // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid. - let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1); - let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1); - let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1); - let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1); - - // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d. - let u_d_lo = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))), - bias_v, + if !BE { + let alpha_u16 = vdupq_n_u16(0xFFFF); + let rnd_v = vdupq_n_s32(RND); + let rnd64 = vdupq_n_s64(RND as i64); + let y_off_v = vdupq_n_s32(y_off); + let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32 + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s32(bias); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 16 <= width { + // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + + // Extract U/V from chroma via vuzp. + // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid. + let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1); + let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1); + let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1); + let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1); + + // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d. + let u_d_lo = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let v_d_lo = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))), - bias_v, + rnd_v, + )); + let v_d_lo = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let u_d_hi = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))), - bias_v, + rnd_v, + )); + let u_d_hi = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let v_d_hi = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))), - bias_v, + rnd_v, + )); + let v_d_hi = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - - // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow). - let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64); - let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64); - let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64); - let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64); - let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64); - let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64); - - // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). - // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3 - // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7 - let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo); - let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo); - let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo); - let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo); - let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo); - let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo); - let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi); - let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi); - let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi); - let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi); - let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi); - let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi); - - // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range. - // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64. - // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15. - let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0))); - let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0))); - let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0))); - let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0))); - let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64); - let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64); - let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64); - let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64); - - // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]). - // - // Alignment: - // ys_lo_0 = [Y0,Y1,Y2,Y3] r_cd_lo0 = [c0,c0,c1,c1] → pixels 0..3 - // ys_lo_1 = [Y4,Y5,Y6,Y7] r_cd_lo1 = [c2,c2,c3,c3] → pixels 4..7 - // ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5] → pixels 8..11 - // ys_hi_1 = [Y12..Y15] r_cd_hi1 = [c6,c6,c7,c7] → pixels 12..15 - // - // vcombine_u16(A, B) packs two u16x4 into one u16x8. - let r_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)), - ); - let g_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)), - ); - let b_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)), - ); - // hi group (Y8..Y15) - let r_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)), - ); - let g_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)), - ); - let b_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), - ); - - // Each u16x8 covers 8 pixels. Two stores per format (lo + hi). - // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes). - // Offset for lo: x*4 u16. Offset for hi: x*4+32 u16. - // For RGB: each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes). - // Offset for lo: x*3 u16. Offset for hi: x*3+24 u16. - if ALPHA { - vst4q_u16( - out.as_mut_ptr().add(x * 4), - uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + rnd_v, + )); + + // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow). + let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64); + let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64); + let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64); + let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64); + let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64); + let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64); + + // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). + // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3 + // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7 + let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo); + let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo); + let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo); + let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo); + let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo); + let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo); + let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi); + let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi); + let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi); + let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi); + let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi); + let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi); + + // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range. + // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64. + // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15. + let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0))); + let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0))); + let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0))); + let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0))); + let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64); + let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64); + let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64); + let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64); + + // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]). + // + // Alignment: + // ys_lo_0 = [Y0,Y1,Y2,Y3] r_cd_lo0 = [c0,c0,c1,c1] → pixels 0..3 + // ys_lo_1 = [Y4,Y5,Y6,Y7] r_cd_lo1 = [c2,c2,c3,c3] → pixels 4..7 + // ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5] → pixels 8..11 + // ys_hi_1 = [Y12..Y15] r_cd_hi1 = [c6,c6,c7,c7] → pixels 12..15 + // + // vcombine_u16(A, B) packs two u16x4 into one u16x8. + let r_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)), ); - vst4q_u16( - out.as_mut_ptr().add(x * 4 + 32), - uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + let g_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)), ); - } else { - vst3q_u16( - out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + let b_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)), ); - vst3q_u16( - out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + // hi group (Y8..Y15) + let r_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)), + ); + let g_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)), + ); + let b_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - } - x += 16; - } + // Each u16x8 covers 8 pixels. Two stores per format (lo + hi). + // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes). + // Offset for lo: x*4 u16. Offset for hi: x*4+32 u16. + // For RGB: each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes). + // Offset for lo: x*3 u16. Offset for hi: x*3+24 u16. + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } + + x += 16; + } + } // end if !BE - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -416,8 +430,9 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // ---- Luma u8 (16 px/iter) ----------------------------------------------- /// NEON Y216 → u8 luma. Extracts Y via `>> 8`. +/// `BE = true` bypasses NEON and uses scalar. /// -/// Byte-identical to `scalar::y216_to_luma_row`. +/// Byte-identical to `scalar::y216_to_luma_row::`. /// /// # Safety /// @@ -427,29 +442,35 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - // >> 8 narrows u16 → u8 (high byte of each Y sample). - let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0); - let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0); - vst1_u8(out.as_mut_ptr().add(x), y_lo_u8); - vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8); - x += 16; + if !BE { + while x + 16 <= width { + // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + // >> 8 narrows u16 → u8 (high byte of each Y sample). + let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0); + let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0); + vst1_u8(out.as_mut_ptr().add(x), y_lo_u8); + vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -457,8 +478,9 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // ---- Luma u16 (16 px/iter) ---------------------------------------------- /// NEON Y216 → u16 luma. Direct copy of Y samples (no shift). +/// `BE = true` bypasses NEON and uses scalar. /// -/// Byte-identical to `scalar::y216_to_luma_u16_row`. +/// Byte-identical to `scalar::y216_to_luma_u16_row::`. /// /// # Safety /// @@ -468,26 +490,32 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { let mut x = 0usize; - while x + 16 <= width { - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - // Direct copy — Y samples are already full 16-bit (no shift needed). - vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0); - vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0); - x += 16; + if !BE { + while x + 16 <= width { + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + // Direct copy — Y samples are already full 16-bit (no shift needed). + vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0); + vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/neon/y2xx.rs b/src/row/arch/neon/y2xx.rs index 0c02365f..72920362 100644 --- a/src/row/arch/neon/y2xx.rs +++ b/src/row/arch/neon/y2xx.rs @@ -83,11 +83,12 @@ unsafe fn unpack_y2xx_8px_neon( } /// NEON Y2xx → packed RGB / RGBA u8. Const‑generic over -/// `BITS ∈ {10, 12}` and `ALPHA ∈ {false, true}`. Output bit depth is -/// u8 (downshifted from the native BITS Q15 pipeline via -/// `range_params_n::`). +/// `BITS ∈ {10, 12}`, `ALPHA ∈ {false, true}`, and `BE ∈ {false, true}`. +/// `BE = true` selects big-endian u16 decoding for the input samples. +/// When `BE = true` the SIMD path is bypassed and the scalar kernel +/// handles the full row (the NEON loop only handles native-endian data). /// -/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::` +/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::` /// for every input. /// /// # Safety @@ -98,7 +99,11 @@ unsafe fn unpack_y2xx_8px_neon( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -126,86 +131,90 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. 8 valid lanes per channel. - let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup)); - let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup)); - let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup)); - - if ALPHA { - let alpha = vdup_n_u8(0xFF); - vst4_u8( - out.as_mut_ptr().add(x * 4), - uint8x8x4_t(r_u8, g_u8, b_u8, alpha), - ); - } else { - vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8)); + if !BE { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = vreinterpretq_s16_u16(y_vec); + + // Subtract chroma bias (e.g. 512 for 10‑bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + // Widen 8‑lane i16 chroma to two i32x4 halves for the Q15 + // multiplies. Only lanes 0..3 of `_lo` are valid; `_hi` is + // entirely don't-care (duplicate of `_lo`). We feed both + // halves through `chroma_i16x8` to recycle the helper exactly; + // the don't-care output lanes are discarded by `vzip1q_s16` + // below (which only consumes lanes 0..3). + let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + // 8‑lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `vzip1q_s16` so lanes 0..7 of `r_dup` align with Y0..Y7. + // `vzip1q_s16` interleaves the low 4 lanes of each operand: + // [c0, c0, c1, c1, c2, c2, c3, c3] + let r_dup = vzip1q_s16(r_chroma, r_chroma); + let g_dup = vzip1q_s16(g_chroma, g_chroma); + let b_dup = vzip1q_s16(b_chroma, b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. 8 valid lanes per channel. + let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup)); + let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup)); + let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup)); + + if ALPHA { + let alpha = vdup_n_u8(0xFF); + vst4_u8( + out.as_mut_ptr().add(x * 4), + uint8x8x4_t(r_u8, g_u8, b_u8, alpha), + ); + } else { + vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8)); + } + + x += 8; } - - x += 8; } - // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // Scalar tail — remaining < 8 pixels (always even per 4:2:2), or + // full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -218,10 +227,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row`. +/// `scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -231,7 +241,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -257,71 +271,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -335,9 +352,9 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row> (BITS - 8)` after the `>> (16 - BITS)` MSB‑alignment, i.e. /// a single `>> 8` from the raw u16 sample. Bypasses the YUV → RGB -/// pipeline entirely. +/// pipeline entirely. `BE = true` bypasses NEON and uses scalar. /// -/// Byte‑identical to `scalar::y2xx_n_to_luma_row::`. +/// Byte‑identical to `scalar::y2xx_n_to_luma_row::`. /// /// # Safety /// @@ -347,7 +364,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -365,29 +382,32 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - while x + 8 <= width { - // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples - // (still MSB‑aligned at BITS ≤ 12, low bits zero). - let pair = vld2q_u16(packed.as_ptr().add(x * 2)); - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` - // for any BITS ∈ {10, 12} — the constant fold gives the same - // result whether we shift in two stages or one. - let y_u8 = vshrn_n_u16::<8>(pair.0); - vst1_u8(luma_out.as_mut_ptr().add(x), y_u8); - x += 8; + if !BE { + while x + 8 <= width { + // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples + // (still MSB‑aligned at BITS ≤ 12, low bits zero). + let pair = vld2q_u16(packed.as_ptr().add(x * 2)); + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` + // for any BITS ∈ {10, 12} — the constant fold gives the same + // result whether we shift in two stages or one. + let y_u8 = vshrn_n_u16::<8>(pair.0); + vst1_u8(luma_out.as_mut_ptr().add(x), y_u8); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } /// NEON Y2xx → native‑depth `u16` luma (low‑bit‑packed). Each output /// `u16` carries the source's BITS-bit Y value in its low BITS bits. -/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::`. +/// `BE = true` bypasses NEON and uses scalar. +/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::`. /// /// # Safety /// @@ -397,7 +417,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -414,21 +434,23 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); let mut x = 0usize; - while x + 8 <= width { - let pair = vld2q_u16(packed.as_ptr().add(x * 2)); - // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples - // into low‑bit‑packed form for the native‑depth u16 output. - let y_low = vshlq_u16(pair.0, shr_count); - vst1q_u16(luma_out.as_mut_ptr().add(x), y_low); - x += 8; + if !BE { + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + while x + 8 <= width { + let pair = vld2q_u16(packed.as_ptr().add(x * 2)); + // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples + // into low‑bit‑packed form for the native‑depth u16 output. + let y_low = vshlq_u16(pair.0, shr_count); + vst1q_u16(luma_out.as_mut_ptr().add(x), y_low); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/tests/v210.rs b/src/row/arch/wasm_simd128/tests/v210.rs index ac7455c2..d4d51116 100644 --- a/src/row/arch/wasm_simd128/tests/v210.rs +++ b/src/row/arch/wasm_simd128/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v210→luma u16 diverges (width={width})"); } @@ -227,7 +227,7 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma_out = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma_out, W); + v210_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -239,9 +239,15 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/y216.rs b/src/row/arch/wasm_simd128/tests/y216.rs index 8441d72c..034f029b 100644 --- a/src/row/arch/wasm_simd128/tests/y216.rs +++ b/src/row/arch/wasm_simd128/tests/y216.rs @@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -29,9 +29,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -43,9 +43,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -57,9 +57,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -71,9 +71,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y216→luma u8 diverges (width={width})"); } @@ -82,9 +82,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y216→luma u16 diverges (width={width})"); } @@ -183,7 +183,7 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -195,9 +195,15 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/y2xx.rs b/src/row/arch/wasm_simd128/tests/y2xx.rs index 08a484ce..ad31d2f1 100644 --- a/src/row/arch/wasm_simd128/tests/y2xx.rs +++ b/src/row/arch/wasm_simd128/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -101,9 +101,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -115,9 +115,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -129,9 +129,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -143,9 +145,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -157,9 +161,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -168,9 +172,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -251,15 +255,15 @@ fn wasm_simd128_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "simd128 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -267,7 +271,7 @@ fn wasm_simd128_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -282,17 +286,17 @@ fn wasm_simd128_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "simd128 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "simd128 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/wasm_simd128/v210.rs b/src/row/arch/wasm_simd128/v210.rs index dba59ca9..264ca1c4 100644 --- a/src/row/arch/wasm_simd128/v210.rs +++ b/src/row/arch/wasm_simd128/v210.rs @@ -16,7 +16,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks one 16-byte v210 word into three `v128` vectors holding @@ -45,11 +45,11 @@ use crate::{ColorMatrix, row::scalar}; /// wasm). #[inline] #[target_feature(enable = "simd128")] -unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { +unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { // SAFETY: caller obligation — `ptr` has 16 bytes readable; simd128 // is enabled at compile time. unsafe { - let words = v128_load(ptr.cast()); + let words = load_endian_u32x4::(ptr); let mask10 = i32x4_splat(0x3FF); let low10 = v128_and(words, mask10); let mid10 = v128_and(u32x4_shr(words, 10), mask10); @@ -146,7 +146,7 @@ unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -183,7 +183,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = i32x4_splat(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; @@ -270,7 +270,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -287,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -324,7 +330,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = i32x4_splat(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; let u_i16 = i16x8_sub(u_vec, bias_v); @@ -391,7 +397,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -414,7 +420,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -424,7 +434,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x16 via // saturating narrow (Y ≤ 1023 stays well inside [0, 255] post-shift). let y_shr = u16x8_shr(y_vec, 2); @@ -439,7 +449,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -456,7 +466,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -466,7 +480,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; v128_store(tmp.as_mut_ptr().cast(), y_vec); @@ -477,7 +491,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/y216.rs b/src/row/arch/wasm_simd128/y216.rs index 5beb78f2..7bdf6363 100644 --- a/src/row/arch/wasm_simd128/y216.rs +++ b/src/row/arch/wasm_simd128/y216.rs @@ -107,7 +107,7 @@ unsafe fn unpack_y216_8px_wasm(ptr: *const u16) -> (v128, v128, v128) { /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -124,102 +124,111 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = i32x4_splat(RND); - let y_off32_v = i32x4_splat(y_off); - let y_scale_v = i32x4_splat(y_scale); - let c_scale_v = i32x4_splat(c_scale); - // Bias = 32768 = 0x8000; as i16 this wraps to -32768. - // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly - // maps full-u16 chroma [0, 65535] to [-32768, 32767]. - let bias16_v = i16x8_splat(-32768i16); - let alpha_u8 = u8x16_splat(0xFF); - let cru = i32x4_splat(coeffs.r_u()); - let crv = i32x4_splat(coeffs.r_v()); - let cgu = i32x4_splat(coeffs.g_u()); - let cgv = i32x4_splat(coeffs.g_v()); - let cbu = i32x4_splat(coeffs.b_u()); - let cbv = i32x4_splat(coeffs.b_v()); - let mut x = 0usize; - // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15). - while x + 16 <= width { - let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16)); - - // Chroma bias subtraction (wrapping trick for full-u16 range). - let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); - let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v); - let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); - let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v); - - // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid. - // Hi halves hold zeros (from the swizzle mask) — don't-care since - // `chroma_i16x8` discards lanes 4..7 after `dup_lo`. - let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16); - let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16); - let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16); - let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16); - let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16); - let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16); - let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16); - let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16); - - // Q15 chroma scale → i32x4 (scaled chroma deltas). - let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v)); - let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v)); - let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v)); - let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v)); - let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v)); - let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v)); - let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v)); - let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v)); - - // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care). - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - - // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample). - let r_dup_lo = dup_lo(r_chroma_lo); - let g_dup_lo = dup_lo(g_chroma_lo); - let b_dup_lo = dup_lo(b_chroma_lo); - let r_dup_hi = dup_lo(r_chroma_hi); - let g_dup_hi = dup_lo(g_chroma_hi); - let b_dup_hi = dup_lo(b_chroma_hi); - - // Y scale via unsigned widening (Y216 has full u16 range; i16 would - // overflow for Y > 32767). - let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v); - - // Saturating add → saturating narrow to u8x16. - let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo); - let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi); - let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo); - let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi); - let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo); - let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi); - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - if ALPHA { - write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = i32x4_splat(RND); + let y_off32_v = i32x4_splat(y_off); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + // Bias = 32768 = 0x8000; as i16 this wraps to -32768. + // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly + // maps full-u16 chroma [0, 65535] to [-32768, 32767]. + let bias16_v = i16x8_splat(-32768i16); + let alpha_u8 = u8x16_splat(0xFF); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15). + while x + 16 <= width { + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); + let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16)); + + // Chroma bias subtraction (wrapping trick for full-u16 range). + let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); + let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v); + let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); + let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v); + + // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid. + // Hi halves hold zeros (from the swizzle mask) — don't-care since + // `chroma_i16x8` discards lanes 4..7 after `dup_lo`. + let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16); + let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16); + let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16); + let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16); + let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16); + let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16); + let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16); + let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16); + + // Q15 chroma scale → i32x4 (scaled chroma deltas). + let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v)); + let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v)); + let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v)); + let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v)); + let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v)); + let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v)); + let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v)); + let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v)); + + // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care). + let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + + // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample). + let r_dup_lo = dup_lo(r_chroma_lo); + let g_dup_lo = dup_lo(g_chroma_lo); + let b_dup_lo = dup_lo(b_chroma_lo); + let r_dup_hi = dup_lo(r_chroma_hi); + let g_dup_hi = dup_lo(g_chroma_hi); + let b_dup_hi = dup_lo(b_chroma_hi); + + // Y scale via unsigned widening (Y216 has full u16 range; i16 would + // overflow for Y > 32767). + let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v); + + // Saturating add → saturating narrow to u8x16. + let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo); + let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi); + let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo); + let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi); + let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo); + let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi); + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + x += 16; } - x += 16; } // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -237,7 +246,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -255,101 +264,104 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND_I32: i32 = 1 << 14; unsafe { - let alpha_u16 = u16x8_splat(0xFFFF); - let rnd_i64 = i64x2_splat(RND_I64); - let rnd_i32 = i32x4_splat(RND_I32); - let y_off32 = i32x4_splat(y_off); - let y_scale_i64 = i64x2_splat(y_scale as i64); - let c_scale_i32 = i32x4_splat(c_scale); - // Wrapping 0x8000 bias trick for full-u16 chroma. - let bias16 = i16x8_splat(-32768i16); - // Coefficients widened once to i64x2. - let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u())); - let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v())); - let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u())); - let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v())); - let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u())); - let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v())); - let mut x = 0usize; - // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs. - while x + 8 <= width { - let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); - - // Chroma bias (wrapping trick). - let u_i16 = i16x8_sub(u_vec, bias16); - let v_i16 = i16x8_sub(v_vec, bias16); - - // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares). - let u_i32 = i32x4_extend_low_i16x8(u_i16); - let v_i32 = i32x4_extend_low_i16x8(v_i16); - - // Q15 scale → 4 × i32 chroma deltas. - let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15); - let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15); - - // Widen to 2 × i64x2 for i64 chroma pipeline. - let u_d_lo = i64x2_extend_low_i32x4(u_d); - let u_d_hi = i64x2_extend_high_i32x4(u_d); - let v_d_lo = i64x2_extend_low_i32x4(v_d); - let v_d_hi = i64x2_extend_high_i32x4(v_d); - - let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64); - let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64); - let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64); - let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64); - let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64); - let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64); - - // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3]. - let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi); - let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi); - let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi); - - // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). - // chroma_dup_i32x4_u16([c0,c1,c2,c3]) → - // lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3] - let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32); - let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32); - let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32); - - // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64. - let y_lo_u32 = u32x4_extend_low_u16x8(y_vec); - let y_hi_u32 = u32x4_extend_high_u16x8(y_vec); - let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32); - let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32); - - let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64); - let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64); - - // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]). - let r_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, r_dup_lo), - i32x4_add(y_hi_scaled, r_dup_hi), - ); - let g_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, g_dup_lo), - i32x4_add(y_hi_scaled, g_dup_hi), - ); - let b_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, b_dup_lo), - i32x4_add(y_hi_scaled, b_dup_hi), - ); - - if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = u16x8_splat(0xFFFF); + let rnd_i64 = i64x2_splat(RND_I64); + let rnd_i32 = i32x4_splat(RND_I32); + let y_off32 = i32x4_splat(y_off); + let y_scale_i64 = i64x2_splat(y_scale as i64); + let c_scale_i32 = i32x4_splat(c_scale); + // Wrapping 0x8000 bias trick for full-u16 chroma. + let bias16 = i16x8_splat(-32768i16); + // Coefficients widened once to i64x2. + let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u())); + let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v())); + let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u())); + let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v())); + let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u())); + let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v())); + + // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs. + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); + + // Chroma bias (wrapping trick). + let u_i16 = i16x8_sub(u_vec, bias16); + let v_i16 = i16x8_sub(v_vec, bias16); + + // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares). + let u_i32 = i32x4_extend_low_i16x8(u_i16); + let v_i32 = i32x4_extend_low_i16x8(v_i16); + + // Q15 scale → 4 × i32 chroma deltas. + let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15); + let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15); + + // Widen to 2 × i64x2 for i64 chroma pipeline. + let u_d_lo = i64x2_extend_low_i32x4(u_d); + let u_d_hi = i64x2_extend_high_i32x4(u_d); + let v_d_lo = i64x2_extend_low_i32x4(v_d); + let v_d_hi = i64x2_extend_high_i32x4(v_d); + + let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64); + let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64); + let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64); + let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64); + let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64); + let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64); + + // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3]. + let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi); + let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi); + let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi); + + // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). + // chroma_dup_i32x4_u16([c0,c1,c2,c3]) → + // lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3] + let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32); + let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32); + let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32); + + // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64. + let y_lo_u32 = u32x4_extend_low_u16x8(y_vec); + let y_hi_u32 = u32x4_extend_high_u16x8(y_vec); + let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32); + let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32); + + let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64); + let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64); + + // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]). + let r_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, r_dup_lo), + i32x4_add(y_hi_scaled, r_dup_hi), + ); + let g_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, g_dup_lo), + i32x4_add(y_hi_scaled, g_dup_hi), + ); + let b_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, b_dup_lo), + i32x4_add(y_hi_scaled, b_dup_hi), + ); + + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + x += 8; } - x += 8; } // Scalar tail. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -373,48 +385,56 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(luma_out.len() >= width); unsafe { - // Y permute: even u16 lanes → low 8 bytes; zeroed high. - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - // 16 px/iter: two groups of 8 Y samples. - while x + 16 <= width { - // lo group: Y0..Y7 from bytes x*2 .. x*2+32. - let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); - let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo0 = u8x16_swizzle(lo0, y_idx); - let y_lo1 = u8x16_swizzle(lo1, y_idx); - let y_lo = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); - - // hi group: Y8..Y15 from bytes x*2+32 .. x*2+64. - let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); - let y_hi0 = u8x16_swizzle(hi0, y_idx); - let y_hi1 = u8x16_swizzle(hi1, y_idx); - let y_hi = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); - - // >> 8: extract high byte of each u16 Y sample. - let y_shr_lo = u16x8_shr(y_lo, 8); - let y_shr_hi = u16x8_shr(y_hi, 8); - // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255). - let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi); - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8); - x += 16; + if !BE { + // Y permute: even u16 lanes → low 8 bytes; zeroed high. + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + // 16 px/iter: two groups of 8 Y samples. + while x + 16 <= width { + // lo group: Y0..Y7 from bytes x*2 .. x*2+32. + let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); + let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo0 = u8x16_swizzle(lo0, y_idx); + let y_lo1 = u8x16_swizzle(lo1, y_idx); + let y_lo = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); + + // hi group: Y8..Y15 from bytes x*2+32 .. x*2+64. + let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); + let y_hi0 = u8x16_swizzle(hi0, y_idx); + let y_hi1 = u8x16_swizzle(hi1, y_idx); + let y_hi = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); + + // >> 8: extract high byte of each u16 Y sample. + let y_shr_lo = u16x8_shr(y_lo, 8); + let y_shr_hi = u16x8_shr(y_hi, 8); + // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255). + let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi); + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8); + x += 16; + } } + // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -432,44 +452,52 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(luma_out.len() >= width); unsafe { - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift). - while x + 16 <= width { - // lo group: Y0..Y7 - let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); - let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo0 = u8x16_swizzle(lo0, y_idx); - let y_lo1 = u8x16_swizzle(lo1, y_idx); - let y_lo = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); - - // hi group: Y8..Y15 - let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); - let y_hi0 = u8x16_swizzle(hi0, y_idx); - let y_hi1 = u8x16_swizzle(hi1, y_idx); - let y_hi = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); - - // Direct store — full 16-bit Y, no shift. - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); - v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi); - x += 16; + if !BE { + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift). + while x + 16 <= width { + // lo group: Y0..Y7 + let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); + let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo0 = u8x16_swizzle(lo0, y_idx); + let y_lo1 = u8x16_swizzle(lo1, y_idx); + let y_lo = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); + + // hi group: Y8..Y15 + let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); + let y_hi0 = u8x16_swizzle(hi0, y_idx); + let y_hi1 = u8x16_swizzle(hi1, y_idx); + let y_hi = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); + + // Direct store — full 16-bit Y, no shift. + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); + v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi); + x += 16; + } } + // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/y2xx.rs b/src/row/arch/wasm_simd128/y2xx.rs index 91e77803..83c4a6eb 100644 --- a/src/row/arch/wasm_simd128/y2xx.rs +++ b/src/row/arch/wasm_simd128/y2xx.rs @@ -137,7 +137,11 @@ unsafe fn unpack_y2xx_8px_wasm(ptr: *const u16, shr_count: u32) -> (v128, v128, /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -165,112 +169,115 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits - // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low - // 8 bytes of the result hold the saturated u8 of the input - // i16x8. Only the first 8 bytes per channel matter. - let r_sum = i16x8_add_sat(y_scaled, r_dup); - let g_sum = i16x8_add_sat(y_scaled, g_dup); - let b_sum = i16x8_add_sat(y_scaled, b_dup); - let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum); - let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum); - let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum); - - // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] / - // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so - // for the 8-px-iter body we use the v210-style stack-buffer + - // scalar interleave pattern. (8 px × 3 = 24 bytes RGB, - // 8 px × 4 = 32 bytes RGBA.) - let mut r_tmp = [0u8; 16]; - let mut g_tmp = [0u8; 16]; - let mut b_tmp = [0u8; 16]; - v128_store(r_tmp.as_mut_ptr().cast(), r_u8); - v128_store(g_tmp.as_mut_ptr().cast(), g_u8); - v128_store(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 8 * 4]; - for i in 0..8 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 8 * 3]; - for i in 0..8 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + // Loop-invariant runtime shift count for `u16x8_shr`, see + // module-level note. + let shr_count: u32 = 16 - BITS; + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + // Widen 8-lane i16 chroma to two i32x4 halves so the Q15 + // multiplies don't overflow. Only lanes 0..3 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x8` to recycle the helper exactly; the + // don't-care output lanes are discarded by the [`dup_lo`] + // duplicate step below (which only consumes lanes 0..3). + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + // 8-lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // [`dup_lo`] so lanes 0..7 of `r_dup` align with Y0..Y7. Lane + // order: [c0, c0, c1, c1, c2, c2, c3, c3]. + let r_dup = dup_lo(r_chroma); + let g_dup = dup_lo(g_chroma); + let b_dup = dup_lo(b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits + // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low + // 8 bytes of the result hold the saturated u8 of the input + // i16x8. Only the first 8 bytes per channel matter. + let r_sum = i16x8_add_sat(y_scaled, r_dup); + let g_sum = i16x8_add_sat(y_scaled, g_dup); + let b_sum = i16x8_add_sat(y_scaled, b_dup); + let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum); + let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum); + let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum); + + // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] / + // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so + // for the 8-px-iter body we use the v210-style stack-buffer + + // scalar interleave pattern. (8 px × 3 = 24 bytes RGB, + // 8 px × 4 = 32 bytes RGBA.) + let mut r_tmp = [0u8; 16]; + let mut g_tmp = [0u8; 16]; + let mut b_tmp = [0u8; 16]; + v128_store(r_tmp.as_mut_ptr().cast(), r_u8); + v128_store(g_tmp.as_mut_ptr().cast(), g_u8); + v128_store(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 8 * 4]; + for i in 0..8 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 8 * 3]; + for i in 0..8 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 8; + x += 8; + } } // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -296,7 +303,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -322,72 +333,76 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -413,7 +428,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -430,40 +445,44 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - // Y permute mask: pick even u16 lanes (low byte at [0], high byte - // at [1]) into the low 8 bytes; high 8 bytes zeroed. - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = v128_load(packed.as_ptr().add(x * 2).cast()); - let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _] - let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _] - // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as - // the 4:2:2 unpack helper. - let y_vec = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned - - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`. - let y_shr = u16x8_shr(y_vec, 8); - // Pack 8 i16 lanes to u8 — only low 8 bytes used. - let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr); - // Store low 8 bytes via stack buffer + copy_from_slice. - let mut tmp = [0u8; 16]; - v128_store(tmp.as_mut_ptr().cast(), y_u8); - luma_out[x..x + 8].copy_from_slice(&tmp[..8]); - - x += 8; + if !BE { + // Y permute mask: pick even u16 lanes (low byte at [0], high byte + // at [1]) into the low 8 bytes; high 8 bytes zeroed. + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = v128_load(packed.as_ptr().add(x * 2).cast()); + let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _] + let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _] + // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as + // the 4:2:2 unpack helper. + let y_vec = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned + + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`. + let y_shr = u16x8_shr(y_vec, 8); + // Pack 8 i16 lanes to u8 — only low 8 bytes used. + let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr); + // Store low 8 bytes via stack buffer + copy_from_slice. + let mut tmp = [0u8; 16]; + v128_store(tmp.as_mut_ptr().cast(), y_u8); + luma_out[x..x + 8].copy_from_slice(&tmp[..8]); + + x += 8; + } } + // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -480,7 +499,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -497,29 +516,33 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count: u32 = 16 - BITS; - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = v128_load(packed.as_ptr().add(x * 2).cast()); - let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = u8x16_swizzle(lo, y_idx); - let y_hi = u8x16_swizzle(hi, y_idx); - let y_vec = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples - // into low-bit-packed form for the native-depth u16 output. - let y_low = u16x8_shr(y_vec, shr_count); - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 8; + if !BE { + let shr_count: u32 = 16 - BITS; + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = v128_load(packed.as_ptr().add(x * 2).cast()); + let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = u8x16_swizzle(lo, y_idx); + let y_hi = u8x16_swizzle(hi, y_idx); + let y_vec = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = u16x8_shr(y_vec, shr_count); + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 8; + } } + // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/tests/v210.rs b/src/row/arch/x86_avx2/tests/v210.rs index 9c1f8315..d6bf96ae 100644 --- a/src/row/arch/x86_avx2/tests/v210.rs +++ b/src/row/arch/x86_avx2/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v210→luma u16 diverges (width={width})"); } @@ -238,7 +238,7 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx2 v210 luma reorder bug"); @@ -247,9 +247,15 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx2/tests/y216.rs b/src/row/arch/x86_avx2/tests/y216.rs index f7428a32..34cd1b89 100644 --- a/src/row/arch/x86_avx2/tests/y216.rs +++ b/src/row/arch/x86_avx2/tests/y216.rs @@ -16,9 +16,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -33,9 +33,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -49,9 +49,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y216→luma u8 diverges (width={width})"); } @@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y216→luma u16 diverges (width={width})"); } @@ -169,7 +169,7 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "AVX2 y216 luma_u16 reorder bug"); @@ -178,9 +178,15 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx2/tests/y2xx.rs b/src/row/arch/x86_avx2/tests/y2xx.rs index de7fcd45..26825f38 100644 --- a/src/row/arch/x86_avx2/tests/y2xx.rs +++ b/src/row/arch/x86_avx2/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -107,9 +107,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -121,9 +121,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -135,9 +135,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -149,9 +151,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -163,9 +167,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -174,9 +178,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma u16 diverges (width={width})"); } @@ -262,15 +266,15 @@ fn avx2_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "AVX2 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -278,7 +282,7 @@ fn avx2_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -290,17 +294,17 @@ fn avx2_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "AVX2 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "AVX2 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_avx2/v210.rs b/src/row/arch/x86_avx2/v210.rs index 49407edd..13164309 100644 --- a/src/row/arch/x86_avx2/v210.rs +++ b/src/row/arch/x86_avx2/v210.rs @@ -34,7 +34,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x8, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks two consecutive 16-byte v210 words (= 12 pixels) into @@ -63,11 +63,11 @@ use crate::{ColorMatrix, row::scalar}; /// `target_feature` includes AVX2 (which implies AVX, SSSE3, etc.). #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 32 bytes readable; AVX2 // (and thus SSSE3) is available. unsafe { - let words = _mm256_loadu_si256(ptr.cast()); + let words = load_endian_u32x8::(ptr); let mask10 = _mm256_set1_epi32(0x3FF); let low10 = _mm256_and_si256(words, mask10); let mid10 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask10); @@ -224,7 +224,7 @@ unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -263,7 +263,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( // Main loop: 12 pixels (2 v210 words = 32 bytes) per iteration. let pairs = words / 2; for p in 0..pairs { - let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); let y_i16 = y_vec; @@ -369,7 +369,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -386,7 +392,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -424,7 +430,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let pairs = words / 2; for p in 0..pairs { - let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); let y_i16 = y_vec; let u_i16 = _mm256_sub_epi16(u_vec, bias_v); @@ -503,7 +509,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -526,7 +532,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -537,7 +547,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: unsafe { let pairs = words / 2; for p in 0..pairs { - let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, _, _) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x32 via packus. let y_shr = _mm256_srli_epi16::<2>(y_vec); let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256()); @@ -554,7 +564,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -571,7 +581,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -582,7 +596,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w unsafe { let pairs = words / 2; for p in 0..pairs { - let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, _, _) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); // Store first 12 of the 16 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 16]; _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_vec); @@ -596,7 +610,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/y216.rs b/src/row/arch/x86_avx2/y216.rs index 4184b3bb..cf850e18 100644 --- a/src/row/arch/x86_avx2/y216.rs +++ b/src/row/arch/x86_avx2/y216.rs @@ -109,7 +109,7 @@ unsafe fn unpack_y216_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -128,137 +128,146 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { - let rnd_v = _mm256_set1_epi32(RND); - // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off. - let y_off_v = _mm256_set1_epi32(y_off); - let y_scale_v = _mm256_set1_epi32(y_scale); - let c_scale_v = _mm256_set1_epi32(c_scale); - // Chroma bias: 32768 via wrapping 0x8000 = -32768i16. - let bias16_v = _mm256_set1_epi16(-32768i16); - let cru = _mm256_set1_epi32(coeffs.r_u()); - let crv = _mm256_set1_epi32(coeffs.r_v()); - let cgu = _mm256_set1_epi32(coeffs.g_u()); - let cgv = _mm256_set1_epi32(coeffs.g_v()); - let cbu = _mm256_set1_epi32(coeffs.b_u()); - let cbv = _mm256_set1_epi32(coeffs.b_v()); - let alpha_u8 = _mm256_set1_epi8(-1i8); - let mut x = 0usize; - while x + 32 <= width { - // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ------ - let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); - - // Chroma bias subtraction (wrapping). - let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); - let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v); - - // Widen 8 valid chroma i16 lanes to two i32x8 halves. - // Only the low 128 bits of u_lo_vec carry valid U0..U7; - // the high 128 bits are zeroed by the 0x88 permute (don't-care). - let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16)); - let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16)); - let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16)); - let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16)); - - let u_d_lo_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_a, c_scale_v), - rnd_v, - )); - let u_d_lo_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_b, c_scale_v), - rnd_v, - )); - let v_d_lo_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_a, c_scale_v), - rnd_v, - )); - let v_d_lo_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_b, c_scale_v), - rnd_v, - )); - - // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo). - let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - - // Duplicate each chroma into its 4:2:2 Y-pair slot. - // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used - // here since we have only 8 chroma samples per 16-px half. - let (r_dup_lo, _) = chroma_dup(r_chroma_lo); - let (g_dup_lo, _) = chroma_dup(g_chroma_lo); - let (b_dup_lo, _) = chroma_dup(b_chroma_lo); - - // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767. - let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v); - - // --- hi group: pixels x+16..x+31 ----------------------------------- - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32)); - - let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); - let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v); - - let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16)); - let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16)); - let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16)); - let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16)); - - let u_d_hi_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_a, c_scale_v), - rnd_v, - )); - let u_d_hi_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_b, c_scale_v), - rnd_v, - )); - let v_d_hi_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_a, c_scale_v), - rnd_v, - )); - let v_d_hi_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_b, c_scale_v), - rnd_v, - )); - - let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - - let (r_dup_hi, _) = chroma_dup(r_chroma_hi); - let (g_dup_hi, _) = chroma_dup(g_chroma_hi); - let (b_dup_hi, _) = chroma_dup(b_chroma_hi); - - let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v); - - // Saturating add + narrow to u8x32 (32 pixels per channel). - let r_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, r_dup_lo), - _mm256_adds_epi16(y_hi_scaled, r_dup_hi), - ); - let g_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, g_dup_lo), - _mm256_adds_epi16(y_hi_scaled, g_dup_hi), - ); - let b_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, b_dup_lo), - _mm256_adds_epi16(y_hi_scaled, b_dup_hi), - ); + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off. + let y_off_v = _mm256_set1_epi32(y_off); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + // Chroma bias: 32768 via wrapping 0x8000 = -32768i16. + let bias16_v = _mm256_set1_epi16(-32768i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1i8); + + while x + 32 <= width { + // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ---- + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); + + // Chroma bias subtraction (wrapping). + let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); + let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v); + + // Widen 8 valid chroma i16 lanes to two i32x8 halves. + // Only the low 128 bits of u_lo_vec carry valid U0..U7; + // the high 128 bits are zeroed by the 0x88 permute (don't-care). + let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16)); + let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16)); + let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16)); + let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16)); + + let u_d_lo_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_a, c_scale_v), + rnd_v, + )); + let u_d_lo_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_b, c_scale_v), + rnd_v, + )); + let v_d_lo_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_a, c_scale_v), + rnd_v, + )); + let v_d_lo_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_b, c_scale_v), + rnd_v, + )); + + // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo). + let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + + // Duplicate each chroma into its 4:2:2 Y-pair slot. + // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used + // here since we have only 8 chroma samples per 16-px half. + let (r_dup_lo, _) = chroma_dup(r_chroma_lo); + let (g_dup_lo, _) = chroma_dup(g_chroma_lo); + let (b_dup_lo, _) = chroma_dup(b_chroma_lo); + + // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767. + let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v); + + // --- hi group: pixels x+16..x+31 ----------------------------------- + let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32)); + + let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); + let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v); + + let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16)); + let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16)); + let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16)); + let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16)); + + let u_d_hi_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_a, c_scale_v), + rnd_v, + )); + let u_d_hi_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_b, c_scale_v), + rnd_v, + )); + let v_d_hi_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_a, c_scale_v), + rnd_v, + )); + let v_d_hi_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_b, c_scale_v), + rnd_v, + )); + + let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + + let (r_dup_hi, _) = chroma_dup(r_chroma_hi); + let (g_dup_hi, _) = chroma_dup(g_chroma_hi); + let (b_dup_hi, _) = chroma_dup(b_chroma_hi); + + let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v); + + // Saturating add + narrow to u8x32 (32 pixels per channel). + let r_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, r_dup_lo), + _mm256_adds_epi16(y_hi_scaled, r_dup_hi), + ); + let g_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, g_dup_lo), + _mm256_adds_epi16(y_hi_scaled, g_dup_hi), + ); + let b_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, b_dup_lo), + _mm256_adds_epi16(y_hi_scaled, b_dup_hi), + ); - if ALPHA { - write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); - } + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } - x += 32; + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -280,7 +289,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -298,132 +307,135 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_v = _mm256_set1_epi64x(RND); - let rnd32_v = _mm256_set1_epi32(1 << 14); - let y_off_v = _mm256_set1_epi32(y_off); - let y_scale_v = _mm256_set1_epi32(y_scale); - let c_scale_v = _mm256_set1_epi32(c_scale); - // Chroma bias via wrapping 0x8000 trick. - let bias16_v = _mm256_set1_epi16(-32768i16); - let cru = _mm256_set1_epi32(coeffs.r_u()); - let crv = _mm256_set1_epi32(coeffs.r_v()); - let cgu = _mm256_set1_epi32(coeffs.g_u()); - let cgv = _mm256_set1_epi32(coeffs.g_v()); - let cbu = _mm256_set1_epi32(coeffs.b_u()); - let cbv = _mm256_set1_epi32(coeffs.b_v()); - let mut x = 0usize; - while x + 16 <= width { - // Two 256-bit loads → 16 pixels, 8 UV pairs. - let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); - - // Subtract chroma bias. - let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); - let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); - - // Widen 8 valid chroma i16 lanes to i32x8. - // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute. - let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); - let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - - // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32). - let u_d = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_i32, c_scale_v), - rnd32_v, - )); - let v_d = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_i32, c_scale_v), - rnd32_v, - )); - - // i64 chroma: even/odd i32 lanes via 0xF5 shuffle. - let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d); - let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v); - let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v); - let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v); - let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); - let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v); - let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); - - // Reassemble i64x4 pairs → i32x8 [c0..c7]. - let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd); - let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd); - let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd); - - // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2). - let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32); - let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32); - let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32); - - // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. - // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes. - let y_lo_u16 = _mm256_castsi256_si128(y_vec); - let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec); - let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v); - let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v); - - let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v); - - // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup. - let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, r_dup_lo), - _mm256_add_epi32(y_hi_scaled, r_dup_hi), - )); - let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, g_dup_lo), - _mm256_add_epi32(y_hi_scaled, g_dup_hi), - )); - let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, b_dup_lo), - _mm256_add_epi32(y_hi_scaled, b_dup_hi), - )); - - // Write 16 pixels via two 8-pixel helpers. - if ALPHA { - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - alpha_u16, - dst, - ); - write_rgba_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - alpha_u16, - dst.add(32), - ); - } else { - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_v = _mm256_set1_epi64x(RND); + let rnd32_v = _mm256_set1_epi32(1 << 14); + let y_off_v = _mm256_set1_epi32(y_off); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + // Chroma bias via wrapping 0x8000 trick. + let bias16_v = _mm256_set1_epi16(-32768i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + // Two 256-bit loads → 16 pixels, 8 UV pairs. + let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); + + // Subtract chroma bias. + let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); + + // Widen 8 valid chroma i16 lanes to i32x8. + // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute. + let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + + // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32). + let u_d = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_i32, c_scale_v), + rnd32_v, + )); + let v_d = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_i32, c_scale_v), + rnd32_v, + )); + + // i64 chroma: even/odd i32 lanes via 0xF5 shuffle. + let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d); + let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v); + let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v); + let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v); + let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); + let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v); + let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); + + // Reassemble i64x4 pairs → i32x8 [c0..c7]. + let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd); + let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd); + let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd); + + // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2). + let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32); + let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32); + let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32); + + // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. + // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes. + let y_lo_u16 = _mm256_castsi256_si128(y_vec); + let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec); + let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v); + let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v); + + let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v); + + // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup. + let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, r_dup_lo), + _mm256_add_epi32(y_hi_scaled, r_dup_hi), + )); + let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, g_dup_lo), + _mm256_add_epi32(y_hi_scaled, g_dup_hi), + )); + let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, b_dup_lo), + _mm256_add_epi32(y_hi_scaled, b_dup_hi), + )); + + // Write 16 pixels via two 8-pixel helpers. + if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } + + x += 16; } - - x += 16; } // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -450,62 +462,69 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX2 availability is the caller's obligation. unsafe { - // Per-lane Y permute mask: pick even u16 lanes (low byte first) into - // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed. - let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane - ); - let mut x = 0usize; - while x + 32 <= width { - // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31. - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); - let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); - - // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane. - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - let v2s = _mm256_shuffle_epi8(v2, split_idx); - let v3s = _mm256_shuffle_epi8(v3, split_idx); - - // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits. - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); - let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); - - // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16). - let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] - let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] - - // `>> 8` to obtain u8 luma (high byte of each Y u16 sample). - // `_mm256_srli_epi16::<8>` has a literal const count. - let y_lo_shr = _mm256_srli_epi16::<8>(y_lo); - let y_hi_shr = _mm256_srli_epi16::<8>(y_hi); - - // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup. - let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr); - _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 32; + if !BE { + // Per-lane Y permute mask: pick even u16 lanes (low byte first) into + // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed. + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane + ); + + while x + 32 <= width { + // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31. + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); + let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); + + // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane. + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v2s = _mm256_shuffle_epi8(v2, split_idx); + let v3s = _mm256_shuffle_epi8(v3, split_idx); + + // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits. + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); + let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); + + // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16). + let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] + let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] + + // `>> 8` to obtain u8 luma (high byte of each Y u16 sample). + // `_mm256_srli_epi16::<8>` has a literal const count. + let y_lo_shr = _mm256_srli_epi16::<8>(y_lo); + let y_hi_shr = _mm256_srli_epi16::<8>(y_hi); + + // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup. + let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr); + _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -526,52 +545,59 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX2 availability is the caller's obligation. unsafe { - // Per-lane Y permute mask (same as luma_row above). - let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1, - ); - let mut x = 0usize; - while x + 32 <= width { - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); - let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); - - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - let v2s = _mm256_shuffle_epi8(v2, split_idx); - let v3s = _mm256_shuffle_epi8(v3, split_idx); - - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); - let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); - - let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] - let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] - - // Direct store — full 16-bit Y values, no shift. - _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo); - _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi); - - x += 32; + if !BE { + // Per-lane Y permute mask (same as luma_row above). + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, + -1, -1, -1, -1, -1, -1, + ); + + while x + 32 <= width { + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); + let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); + + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v2s = _mm256_shuffle_epi8(v2, split_idx); + let v3s = _mm256_shuffle_epi8(v3, split_idx); + + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); + let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); + + let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] + let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] + + // Direct store — full 16-bit Y values, no shift. + _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo); + _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi); + + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/y2xx.rs b/src/row/arch/x86_avx2/y2xx.rs index 1b9d76f3..bc3c5bb1 100644 --- a/src/row/arch/x86_avx2/y2xx.rs +++ b/src/row/arch/x86_avx2/y2xx.rs @@ -164,7 +164,11 @@ unsafe fn unpack_y2xx_16px_avx2( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -192,122 +196,125 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - // 16-lane chroma vectors with valid data in lanes 0..7. - let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via - // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15. - // `_dup_hi` is don't-care (covers Y16..Y31 if input had 32 - // chroma; we have only 8). - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8 - // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so - // the low 16 bytes hold the saturated u8 of our 16 valid lanes. - let zero = _mm256_setzero_si256(); - let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero); - let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero); - let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero); - - // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit - // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel - // iter. Use the v210-style stack-buffer + scalar interleave - // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.) - let mut r_tmp = [0u8; 32]; - let mut g_tmp = [0u8; 32]; - let mut b_tmp = [0u8; 32]; - _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8); - _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8); - _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 16 * 4]; - for i in 0..16 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 16 * 3]; - for i in 0..16 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm256_srl_epi16` — see + // module-level note. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + // Widen 8-valid-lane i16 chroma to two i32x8 halves so the Q15 + // multiplies don't overflow. Only lanes 0..7 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x16` to recycle the helper exactly; the + // don't-care output lanes are discarded by the + // `chroma_dup` step below (which only consumes lanes 0..7 in + // its `lo16` return). + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + // 16-lane chroma vectors with valid data in lanes 0..7. + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15. + // `_dup_hi` is don't-care (covers Y16..Y31 if input had 32 + // chroma; we have only 8). + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8 + // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so + // the low 16 bytes hold the saturated u8 of our 16 valid lanes. + let zero = _mm256_setzero_si256(); + let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero); + let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero); + let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero); + + // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit + // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel + // iter. Use the v210-style stack-buffer + scalar interleave + // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.) + let mut r_tmp = [0u8; 32]; + let mut g_tmp = [0u8; 32]; + let mut b_tmp = [0u8; 32]; + _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8); + _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8); + _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 16 * 4]; + for i in 0..16 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 16 * 3]; + for i in 0..16 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 16; + x += 16; + } } // Scalar tail — remaining < 16 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -334,7 +341,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -360,112 +371,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2 - // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`. - let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); - let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); - let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); - - // 16-pixel u16 store: split each i16x16 channel into two - // 128-bit halves and use the SSE4.1 u16 interleave helpers - // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as - // the AVX2 high-bit YUV planar u16 path. - if ALPHA { - let alpha_u16 = _mm_set1_epi16(out_max); - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_u16_8( - _mm256_castsi256_si128(r), - _mm256_castsi256_si128(g), - _mm256_castsi256_si128(b), - alpha_u16, - dst, - ); - write_rgba_u16_8( - _mm256_extracti128_si256::<1>(r), - _mm256_extracti128_si256::<1>(g), - _mm256_extracti128_si256::<1>(b), - alpha_u16, - dst.add(32), - ); - } else { - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r), - _mm256_castsi256_si128(g), - _mm256_castsi256_si128(b), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r), - _mm256_extracti128_si256::<1>(g), - _mm256_extracti128_si256::<1>(b), - dst.add(24), - ); - } + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm256_set1_epi16(out_max); + let zero_v = _mm256_set1_epi16(0); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2 + // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`. + let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); + let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); + let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); + + // 16-pixel u16 store: split each i16x16 channel into two + // 128-bit halves and use the SSE4.1 u16 interleave helpers + // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as + // the AVX2 high-bit YUV planar u16 path. + if ALPHA { + let alpha_u16 = _mm_set1_epi16(out_max); + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r), + _mm256_castsi256_si128(g), + _mm256_castsi256_si128(b), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r), + _mm256_extracti128_si256::<1>(g), + _mm256_extracti128_si256::<1>(b), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r), + _mm256_castsi256_si128(g), + _mm256_castsi256_si128(b), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r), + _mm256_extracti128_si256::<1>(g), + _mm256_extracti128_si256::<1>(b), + dst.add(24), + ); + } - x += 16; + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -491,7 +504,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -508,50 +521,52 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - // Per-lane Y permute mask: pick even u16 lanes (low byte at [0], - // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed. - let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane - ); - let mut x = 0usize; - while x + 16 <= width { - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y - // values in its two lanes' low 64 bits. Pack lane0_low and - // lane1_low into the low 128 bits of each vector via - // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]). - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes). - // Low 128 of v1p = [Y8..Y15]. - // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low). - let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p); - - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a - // literal const count, so it works without runtime-count helper. - let y_shr = _mm256_srli_epi16::<8>(y_vec); - // Pack 16 i16 lanes to u8 — only low 16 bytes used. - let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256()); - // Store low 16 bytes via stack buffer + copy_from_slice. - let mut tmp = [0u8; 32]; - _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8); - luma_out[x..x + 16].copy_from_slice(&tmp[..16]); - - x += 16; + if !BE { + // Per-lane Y permute mask: pick even u16 lanes (low byte at [0], + // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed. + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane + ); + + while x + 16 <= width { + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y + // values in its two lanes' low 64 bits. Pack lane0_low and + // lane1_low into the low 128 bits of each vector via + // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]). + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes). + // Low 128 of v1p = [Y8..Y15]. + // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low). + let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p); + + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a + // literal const count, so it works without runtime-count helper. + let y_shr = _mm256_srli_epi16::<8>(y_vec); + // Pack 16 i16 lanes to u8 — only low 16 bytes used. + let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256()); + // Store low 16 bytes via stack buffer + copy_from_slice. + let mut tmp = [0u8; 32]; + _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8); + luma_out[x..x + 16].copy_from_slice(&tmp[..16]); + + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -569,7 +584,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -586,33 +601,35 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); - let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane - ); - let mut x = 0usize; - while x + 16 <= width { - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples - // into low-bit-packed form for the native-depth u16 output. - let y_low = _mm256_srl_epi16(y_vec, shr_count); - _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 16; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane + ); + + while x + 16 <= width { + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = _mm256_srl_epi16(y_vec, shr_count); + _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx512/tests/v210.rs b/src/row/arch/x86_avx512/tests/v210.rs index 0abf4bae..f2652cb8 100644 --- a/src/row/arch/x86_avx512/tests/v210.rs +++ b/src/row/arch/x86_avx512/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v210→luma u16 diverges (width={width})"); } @@ -250,7 +250,7 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx512 v210 luma reorder bug"); @@ -259,9 +259,15 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx512/tests/y216.rs b/src/row/arch/x86_avx512/tests/y216.rs index 93fa76cc..ae8b2bc7 100644 --- a/src/row/arch/x86_avx512/tests/y216.rs +++ b/src/row/arch/x86_avx512/tests/y216.rs @@ -16,9 +16,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -33,9 +33,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -49,9 +49,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y216→luma u8 diverges (width={width})"); } @@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y216→luma u16 diverges (width={width})"); } @@ -178,7 +178,7 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "AVX-512 y216 luma_u16 reorder bug"); @@ -187,9 +187,15 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx512/tests/y2xx.rs b/src/row/arch/x86_avx512/tests/y2xx.rs index fd5ccbad..dc609f8d 100644 --- a/src/row/arch/x86_avx512/tests/y2xx.rs +++ b/src/row/arch/x86_avx512/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -111,9 +111,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -125,9 +125,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -139,9 +139,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -153,9 +155,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -167,9 +171,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -178,9 +182,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -278,15 +282,15 @@ fn avx512_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "AVX-512 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -294,7 +298,7 @@ fn avx512_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -309,17 +313,17 @@ fn avx512_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "AVX-512 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "AVX-512 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_avx512/v210.rs b/src/row/arch/x86_avx512/v210.rs index e5a77eb4..8c68f16d 100644 --- a/src/row/arch/x86_avx512/v210.rs +++ b/src/row/arch/x86_avx512/v210.rs @@ -40,7 +40,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x16, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables -------------------------------------- @@ -187,11 +187,11 @@ static V_FROM_MID: [i16; 32] = [ /// `permutexvar` op `vpermw`). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 64 bytes readable; AVX-512F // + AVX-512BW are available. unsafe { - let words = _mm512_loadu_si512(ptr.cast()); + let words = load_endian_u32x16::(ptr); let mask10 = _mm512_set1_epi32(0x3FF); let low10 = _mm512_and_si512(words, mask10); let mid10 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask10); @@ -247,7 +247,7 @@ unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -290,7 +290,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( // Main loop: 24 pixels (4 v210 words = 64 bytes) per iteration. let quads = words / 4; for q in 0..quads { - let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64)); + let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::(packed.as_ptr().add(q * 64)); let y_i16 = y_vec; @@ -392,7 +392,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[quads * 64..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -409,7 +415,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -451,7 +457,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let quads = words / 4; for q in 0..quads { - let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64)); + let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::(packed.as_ptr().add(q * 64)); let y_i16 = y_vec; let u_i16 = _mm512_sub_epi16(u_vec, bias_v); @@ -529,7 +535,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[quads * 64..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -552,7 +558,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -566,7 +576,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let quads = words / 4; for q in 0..quads { - let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64)); + let (y_vec, _, _) = unpack_v210_4words_avx512::(packed.as_ptr().add(q * 64)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x64 via packus // (only first 32 lanes carry data, paired with a zero hi half; // first 24 bytes of the result are valid Y0..Y23). @@ -585,7 +595,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[quads * 64..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -602,7 +612,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -613,7 +627,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w unsafe { let quads = words / 4; for q in 0..quads { - let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64)); + let (y_vec, _, _) = unpack_v210_4words_avx512::(packed.as_ptr().add(q * 64)); // Store first 24 of the 32 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 32]; _mm512_storeu_si512(tmp.as_mut_ptr().cast(), y_vec); @@ -627,7 +641,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[quads * 64..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx512/y216.rs b/src/row/arch/x86_avx512/y216.rs index 2a60b3b3..be564433 100644 --- a/src/row/arch/x86_avx512/y216.rs +++ b/src/row/arch/x86_avx512/y216.rs @@ -118,7 +118,7 @@ unsafe fn unpack_y216_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -137,150 +137,160 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { - let rnd_v = _mm512_set1_epi32(RND); - let y_off_v = _mm512_set1_epi32(y_off); - let y_scale_v = _mm512_set1_epi32(y_scale); - let c_scale_v = _mm512_set1_epi32(c_scale); - // Chroma bias: 32768 via wrapping -32768 i16. - let bias16_v = _mm512_set1_epi16(-32768i16); - let cru = _mm512_set1_epi32(coeffs.r_u()); - let crv = _mm512_set1_epi32(coeffs.r_v()); - let cgu = _mm512_set1_epi32(coeffs.g_u()); - let cgv = _mm512_set1_epi32(coeffs.g_v()); - let cbu = _mm512_set1_epi32(coeffs.b_u()); - let cbv = _mm512_set1_epi32(coeffs.b_v()); - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); - let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); - let mut x = 0usize; - while x + 64 <= width { - // --- lo group: pixels x..x+31 (32 pixels) -------------------------- - let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); - - let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v); - let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v); - - // Widen 16 valid U/V i16 lanes to two i32x16 halves. - let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16)); - let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16)); - let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16)); - let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16)); - - let u_d_lo_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_a, c_scale_v), - rnd_v, - )); - let u_d_lo_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_b, c_scale_v), - rnd_v, - )); - let v_d_lo_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_a, c_scale_v), - rnd_v, - )); - let v_d_lo_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_b, c_scale_v), - rnd_v, - )); - - // chroma_i16x32: 32-lane vector, valid data in lanes 0..16. - let r_chroma_lo = chroma_i16x32( - cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - let g_chroma_lo = chroma_i16x32( - cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - let b_chroma_lo = chroma_i16x32( - cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - - // Duplicate each chroma sample into its 4:2:2 Y-pair slot. - // 16 valid chroma → lo32 covers all 32 Y lanes. - let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx); - - // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767. - let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // --- hi group: pixels x+32..x+63 (32 pixels) ---------------------- - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64)); - - let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); - let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v); - - let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16)); - let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16)); - let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16)); - let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16)); - - let u_d_hi_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_a, c_scale_v), - rnd_v, - )); - let u_d_hi_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_b, c_scale_v), - rnd_v, - )); - let v_d_hi_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_a, c_scale_v), - rnd_v, - )); - let v_d_hi_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_b, c_scale_v), - rnd_v, - )); - - let r_chroma_hi = chroma_i16x32( - cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - let g_chroma_hi = chroma_i16x32( - cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - let b_chroma_hi = chroma_i16x32( - cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - - let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx); - let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx); - let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx); - - let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Saturating i16 add + narrow to u8x64 per channel. - let r_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, r_dup_lo), - _mm512_adds_epi16(y_hi_scaled, r_dup_hi), - pack_fixup, - ); - let g_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, g_dup_lo), - _mm512_adds_epi16(y_hi_scaled, g_dup_hi), - pack_fixup, - ); - let b_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, b_dup_lo), - _mm512_adds_epi16(y_hi_scaled, b_dup_hi), - pack_fixup, - ); - - if ALPHA { - let alpha = _mm512_set1_epi8(-1); - write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi32(y_off); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + // Chroma bias: 32768 via wrapping -32768 i16. + let bias16_v = _mm512_set1_epi16(-32768i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 64 <= width { + // --- lo group: pixels x..x+31 (32 pixels) ------------------------ + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); + + let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v); + let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v); + + // Widen 16 valid U/V i16 lanes to two i32x16 halves. + let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16)); + let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16)); + let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16)); + let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16)); + + let u_d_lo_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_a, c_scale_v), + rnd_v, + )); + let u_d_lo_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_b, c_scale_v), + rnd_v, + )); + let v_d_lo_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_a, c_scale_v), + rnd_v, + )); + let v_d_lo_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_b, c_scale_v), + rnd_v, + )); + + // chroma_i16x32: 32-lane vector, valid data in lanes 0..16. + let r_chroma_lo = chroma_i16x32( + cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + let g_chroma_lo = chroma_i16x32( + cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + let b_chroma_lo = chroma_i16x32( + cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + + // Duplicate each chroma sample into its 4:2:2 Y-pair slot. + // 16 valid chroma → lo32 covers all 32 Y lanes. + let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx); + + // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767. + let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // --- hi group: pixels x+32..x+63 (32 pixels) ---------------------- + let (y_hi_vec, u_hi_vec, v_hi_vec) = + unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64)); + + let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); + let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v); + + let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16)); + let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16)); + let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16)); + let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16)); + + let u_d_hi_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_a, c_scale_v), + rnd_v, + )); + let u_d_hi_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_b, c_scale_v), + rnd_v, + )); + let v_d_hi_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_a, c_scale_v), + rnd_v, + )); + let v_d_hi_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_b, c_scale_v), + rnd_v, + )); + + let r_chroma_hi = chroma_i16x32( + cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + let g_chroma_hi = chroma_i16x32( + cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + let b_chroma_hi = chroma_i16x32( + cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + + let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx); + let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx); + let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx); + + let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Saturating i16 add + narrow to u8x64 per channel. + let r_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, r_dup_lo), + _mm512_adds_epi16(y_hi_scaled, r_dup_hi), + pack_fixup, + ); + let g_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, g_dup_lo), + _mm512_adds_epi16(y_hi_scaled, g_dup_hi), + pack_fixup, + ); + let b_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, b_dup_lo), + _mm512_adds_epi16(y_hi_scaled, b_dup_hi), + pack_fixup, + ); + + if ALPHA { + let alpha = _mm512_set1_epi8(-1); + write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + + x += 64; } - - x += 64; } // Scalar tail — remaining < 64 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -301,7 +311,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -320,125 +330,130 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_i64_v = _mm512_set1_epi64(RND_I64); - let rnd_i32_v = _mm512_set1_epi32(RND_I32); - let y_off_v = _mm512_set1_epi32(y_off); - let y_scale_v = _mm512_set1_epi32(y_scale); - let c_scale_v = _mm512_set1_epi32(c_scale); - let bias16_v = _mm512_set1_epi16(-32768i16); - let cru = _mm512_set1_epi32(coeffs.r_u()); - let crv = _mm512_set1_epi32(coeffs.r_v()); - let cgu = _mm512_set1_epi32(coeffs.g_u()); - let cgv = _mm512_set1_epi32(coeffs.g_v()); - let cbu = _mm512_set1_epi32(coeffs.b_u()); - let cbv = _mm512_set1_epi32(coeffs.b_v()); - - // Permute indices built once. - // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots. - let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); - let dup_hi_idx = _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15); - // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...]. - let interleave_idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let mut x = 0usize; - while x + 32 <= width { - // One deinterleave gives 32 Y + 16 UV pairs. - let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); - - // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16). - let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); - let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); - - // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale. - // High 256 bits of u_vec / v_vec hold don't-care values after the - // U/V split permute; they won't reach chroma_i64x8_avx512. - let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); - let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - - // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 → - // product ≤ ~1.26·10⁹ — fits i32. - let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( - _mm512_mullo_epi32(u_i32, c_scale_v), - rnd_i32_v, - )); - let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( - _mm512_mullo_epi32(v_i32, c_scale_v), - rnd_i32_v, - )); - - // i64 chroma: even and odd i32 lanes separately. - let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d); - let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v); - let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v); - let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v); - let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v); - let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v); - let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v); - - // Reassemble i64x8 pairs → i32x16 [c0..c15]. - let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx); - let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx); - let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx); - - // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling). - let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32); - let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32); - let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32); - let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32); - let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32); - let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32); - - // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64. - let y_lo_u16 = _mm512_castsi512_si256(y_vec); - let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec); - let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v); - let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v); - - let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx); - let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx); - - // Y + chroma → pack with unsigned saturation to u16x32. - let r_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, r_dup_lo), - _mm512_add_epi32(y_hi_scaled, r_dup_hi), - ), - ); - let g_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, g_dup_lo), - _mm512_add_epi32(y_hi_scaled, g_dup_hi), - ), - ); - let b_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, b_dup_lo), - _mm512_add_epi32(y_hi_scaled, b_dup_hi), - ), - ); - - if ALPHA { - write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_i64_v = _mm512_set1_epi64(RND_I64); + let rnd_i32_v = _mm512_set1_epi32(RND_I32); + let y_off_v = _mm512_set1_epi32(y_off); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias16_v = _mm512_set1_epi16(-32768i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + // Permute indices built once. + // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots. + let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); + let dup_hi_idx = + _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15); + // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...]. + let interleave_idx = + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + + while x + 32 <= width { + // One deinterleave gives 32 Y + 16 UV pairs. + let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); + + // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16). + let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); + + // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale. + // High 256 bits of u_vec / v_vec hold don't-care values after the + // U/V split permute; they won't reach chroma_i64x8_avx512. + let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + + // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 → + // product ≤ ~1.26·10⁹ — fits i32. + let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( + _mm512_mullo_epi32(u_i32, c_scale_v), + rnd_i32_v, + )); + let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( + _mm512_mullo_epi32(v_i32, c_scale_v), + rnd_i32_v, + )); + + // i64 chroma: even and odd i32 lanes separately. + let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d); + let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v); + let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v); + let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v); + let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v); + let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v); + let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v); + + // Reassemble i64x8 pairs → i32x16 [c0..c15]. + let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx); + let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx); + let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx); + + // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling). + let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32); + let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32); + let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32); + let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32); + let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32); + let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32); + + // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64. + let y_lo_u16 = _mm512_castsi512_si256(y_vec); + let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec); + let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v); + let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v); + + let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx); + let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx); + + // Y + chroma → pack with unsigned saturation to u16x32. + let r_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, r_dup_lo), + _mm512_add_epi32(y_hi_scaled, r_dup_hi), + ), + ); + let g_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, g_dup_lo), + _mm512_add_epi32(y_hi_scaled, g_dup_hi), + ), + ); + let b_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, b_dup_lo), + _mm512_add_epi32(y_hi_scaled, b_dup_hi), + ), + ); + + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + + x += 32; } - - x += 32; } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -464,43 +479,49 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 64 <= width { - // lo group: pixels x..x+31 - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); - let y_lo_shr = _mm512_srli_epi16::<8>(y_lo); - - // hi group: pixels x+32..x+63 - let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); - let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); - let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); - let y_hi_shr = _mm512_srli_epi16::<8>(y_hi); - - // Pack 64 × i16 → 64 × u8 with natural order. - let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup); - // Store all 64 bytes at once. - _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 64; + if !BE { + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 64 <= width { + // lo group: pixels x..x+31 + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); + let y_lo_shr = _mm512_srli_epi16::<8>(y_lo); + + // hi group: pixels x+32..x+63 + let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); + let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); + let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); + let y_hi_shr = _mm512_srli_epi16::<8>(y_hi); + + // Pack 64 × i16 → 64 × u8 with natural order. + let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup); + // Store all 64 bytes at once. + _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 64; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -520,39 +541,45 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 64 <= width { - // lo group: pixels x..x+31 - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); - - // hi group: pixels x+32..x+63 - let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); - let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); - let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); - - // Direct store — full 16-bit Y values, no shift. - _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo); - _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi); - - x += 64; + if !BE { + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 64 <= width { + // lo group: pixels x..x+31 + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); + + // hi group: pixels x+32..x+63 + let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); + let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); + let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); + + // Direct store — full 16-bit Y values, no shift. + _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo); + _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi); + + x += 64; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx512/y2xx.rs b/src/row/arch/x86_avx512/y2xx.rs index 4944cc6d..1d2b1dcd 100644 --- a/src/row/arch/x86_avx512/y2xx.rs +++ b/src/row/arch/x86_avx512/y2xx.rs @@ -177,7 +177,11 @@ unsafe fn unpack_y2xx_32px_avx512( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -205,132 +209,135 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - // i16x32 chroma vectors with valid data in lanes 0..16. - let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - - // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup` - // duplicates each of 32 chroma lanes into its pair slot, - // splitting across two i16x32 vectors. With 16 valid chroma in - // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1, - // ..., c15,c15]); `hi32` is don't-care. - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Per-channel saturating add (i16x32). All 32 lanes valid. - let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo); - let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo); - let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo); - - // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)` - // packs 32 i16 lanes of `lo` to u8 in the result's first 32 - // bytes (next 32 zero, after the lane-fixup permute). - let zero = _mm512_setzero_si512(); - let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup); - let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup); - let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup); - - // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls - // (each writes 16 px = 48 / 64 bytes). `_mm512_extracti32x4_epi32` - // pulls the two valid 128-bit halves out of the u8x64 result. - if ALPHA { - let alpha = _mm_set1_epi8(-1); - let r0 = _mm512_castsi512_si128(r_u8); - let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); - let g0 = _mm512_castsi512_si128(g_u8); - let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); - let b0 = _mm512_castsi512_si128(b_u8); - let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_16(r0, g0, b0, alpha, dst); - write_rgba_16(r1, g1, b1, alpha, dst.add(64)); - } else { - let r0 = _mm512_castsi512_si128(r_u8); - let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); - let g0 = _mm512_castsi512_si128(g_u8); - let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); - let b0 = _mm512_castsi512_si128(b_u8); - let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_16(r0, g0, b0, dst); - write_rgb_16(r1, g1, b1, dst.add(48)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm512_srl_epi16` — see + // module-level note. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 32 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. Only lanes 0..16 + // carry valid samples; the bias subtraction on don't-care lanes + // is harmless since they're discarded by `chroma_dup`'s `hi32`. + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + // Widen 16-valid-lane i16 chroma to two i32x16 halves so the + // Q15 multiplies don't overflow. Only lanes 0..16 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x32` to recycle the helper exactly; the + // don't-care output lanes 16..32 are discarded by `chroma_dup`'s + // `hi32` return below (which only consumes lanes 0..16 in its + // `lo32` return). + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + // i16x32 chroma vectors with valid data in lanes 0..16. + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup` + // duplicates each of 32 chroma lanes into its pair slot, + // splitting across two i16x32 vectors. With 16 valid chroma in + // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1, + // ..., c15,c15]); `hi32` is don't-care. + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Per-channel saturating add (i16x32). All 32 lanes valid. + let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo); + let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo); + let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo); + + // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)` + // packs 32 i16 lanes of `lo` to u8 in the result's first 32 + // bytes (next 32 zero, after the lane-fixup permute). + let zero = _mm512_setzero_si512(); + let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup); + let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup); + let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup); + + // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls + // (each writes 16 px = 48 / 64 bytes). `_mm512_extracti32x4_epi32` + // pulls the two valid 128-bit halves out of the u8x64 result. + if ALPHA { + let alpha = _mm_set1_epi8(-1); + let r0 = _mm512_castsi512_si128(r_u8); + let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); + let g0 = _mm512_castsi512_si128(g_u8); + let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); + let b0 = _mm512_castsi512_si128(b_u8); + let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_16(r0, g0, b0, alpha, dst); + write_rgba_16(r1, g1, b1, alpha, dst.add(64)); + } else { + let r0 = _mm512_castsi512_si128(r_u8); + let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); + let g0 = _mm512_castsi512_si128(g_u8); + let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); + let b0 = _mm512_castsi512_si128(b_u8); + let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_16(r0, g0, b0, dst); + write_rgb_16(r1, g1, b1, dst.add(48)); + } + + x += 32; } - - x += 32; } // Scalar tail — remaining < 32 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -357,7 +364,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -383,86 +394,88 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. - let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); - let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); - let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); - - // 32-pixel u16 store via the shared 32-pixel writers. - if ALPHA { - let alpha_u16 = _mm_set1_epi16(out_max); - write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm512_set1_epi16(out_max); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 32 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. + let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); + let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); + let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); + + // 32-pixel u16 store via the shared 32-pixel writers. + if ALPHA { + let alpha_u16 = _mm_set1_epi16(out_max); + write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3)); + } + + x += 32; } - - x += 32; } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -488,7 +501,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -505,38 +518,40 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let zero = _mm512_setzero_si512(); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 32 <= width { - // Load 64 u16 = 32 pixels and pull just the Y lanes via the - // cross-vector u16 permute. We don't need chroma here. - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const - // count, so it works without runtime-count helper. - let y_shr = _mm512_srli_epi16::<8>(y_raw); - // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack - // fixup); next 32 zero from the zero-hi pack source. - let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup); - // Store first 32 bytes via the low 256-bit half. - _mm256_storeu_si256( - luma_out.as_mut_ptr().add(x).cast(), - _mm512_castsi512_si256(y_u8), - ); - x += 32; + if !BE { + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let zero = _mm512_setzero_si512(); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 32 <= width { + // Load 64 u16 = 32 pixels and pull just the Y lanes via the + // cross-vector u16 permute. We don't need chroma here. + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const + // count, so it works without runtime-count helper. + let y_shr = _mm512_srli_epi16::<8>(y_raw); + // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack + // fixup); next 32 zero from the zero-hi pack source. + let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup); + // Store first 32 bytes via the low 256-bit half. + _mm256_storeu_si256( + luma_out.as_mut_ptr().add(x).cast(), + _mm512_castsi512_si256(y_u8), + ); + x += 32; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -554,7 +569,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -571,26 +586,28 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 32 <= width { - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into - // low-bit-packed form for the native-depth u16 output. - let y_low = _mm512_srl_epi16(y_raw, shr_count); - _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 32; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 32 <= width { + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into + // low-bit-packed form for the native-depth u16 output. + let y_low = _mm512_srl_epi16(y_raw, shr_count); + _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 32; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/tests/v210.rs b/src/row/arch/x86_sse41/tests/v210.rs index 6f1b9480..dea42837 100644 --- a/src/row/arch/x86_sse41/tests/v210.rs +++ b/src/row/arch/x86_sse41/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v210→luma u16 diverges (width={width})"); } @@ -234,7 +234,7 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "sse4.1 v210 luma reorder bug"); @@ -243,9 +243,15 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_sse41/tests/y216.rs b/src/row/arch/x86_sse41/tests/y216.rs index ebe59115..48e7acf8 100644 --- a/src/row/arch/x86_sse41/tests/y216.rs +++ b/src/row/arch/x86_sse41/tests/y216.rs @@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -32,9 +32,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -48,9 +48,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y216→luma diverges (width={width})"); } @@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y216→luma u16 diverges (width={width})"); } @@ -166,7 +166,7 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "SSE4.1 y216 luma_u16 reorder bug"); @@ -175,9 +175,15 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_sse41/tests/y2xx.rs b/src/row/arch/x86_sse41/tests/y2xx.rs index 1c97b77c..fe0e5cf7 100644 --- a/src/row/arch/x86_sse41/tests/y2xx.rs +++ b/src/row/arch/x86_sse41/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -107,9 +107,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -121,9 +121,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -135,9 +135,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -149,9 +151,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -163,9 +167,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -174,9 +178,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -264,15 +268,15 @@ fn sse41_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "SSE4.1 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -280,7 +284,7 @@ fn sse41_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -295,17 +299,17 @@ fn sse41_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "SSE4.1 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "SSE4.1 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_sse41/v210.rs b/src/row/arch/x86_sse41/v210.rs index cc11438d..eb37f5b8 100644 --- a/src/row/arch/x86_sse41/v210.rs +++ b/src/row/arch/x86_sse41/v210.rs @@ -14,7 +14,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks one 16-byte v210 word into three `__m128i` vectors holding @@ -42,11 +42,11 @@ use crate::{ColorMatrix, row::scalar}; /// `_mm_shuffle_epi8`). #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) { +unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) { // SAFETY: caller obligation — `ptr` has 16 bytes readable; SSE4.1 // (and thus SSSE3) is available. unsafe { - let words = _mm_loadu_si128(ptr.cast()); + let words = load_endian_u32x4::(ptr); let mask10 = _mm_set1_epi32(0x3FF); let low10 = _mm_and_si128(words, mask10); let mid10 = _mm_and_si128(_mm_srli_epi32::<10>(words), mask10); @@ -143,7 +143,7 @@ unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -180,7 +180,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = _mm_set1_epi32(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; @@ -263,7 +263,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -280,7 +286,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -317,7 +323,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = _mm_set1_epi32(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; let u_i16 = _mm_sub_epi16(u_vec, bias_v); @@ -383,7 +389,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -406,7 +412,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -416,7 +426,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8 via packus. let y_shr = _mm_srli_epi16::<2>(y_vec); let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); @@ -430,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -447,7 +457,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -457,7 +471,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_vec); @@ -468,7 +482,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/y216.rs b/src/row/arch/x86_sse41/y216.rs index a98cdc45..e799caee 100644 --- a/src/row/arch/x86_sse41/y216.rs +++ b/src/row/arch/x86_sse41/y216.rs @@ -48,7 +48,7 @@ use crate::{ColorMatrix, row::scalar}; /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -65,160 +65,168 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = _mm_set1_epi32(RND); - // Y216 samples are full u16 [0..65535]; use i32 y_off and - // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767. - let y_off_v = _mm_set1_epi32(y_off); - let y_scale_v = _mm_set1_epi32(y_scale); - let c_scale_v = _mm_set1_epi32(c_scale); - // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000. - let bias16_v = _mm_set1_epi16(-32768i16); - let cru = _mm_set1_epi32(coeffs.r_u()); - let crv = _mm_set1_epi32(coeffs.r_v()); - let cgu = _mm_set1_epi32(coeffs.g_u()); - let cgv = _mm_set1_epi32(coeffs.g_v()); - let cbu = _mm_set1_epi32(coeffs.b_u()); - let cbv = _mm_set1_epi32(coeffs.b_v()); - let alpha_u8 = _mm_set1_epi8(-1); - - // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each). - // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes. - // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes): - // 0,1 = Y0 2,3 = U0 4,5 = Y1 6,7 = V0 - // 8,9 = Y2 10,11 = U1 12,13 = Y3 14,15 = V1 - // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed. - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes. - let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - // U lanes from interleaved [U,V,U,V,...]: even u16 lanes. - let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - // V lanes: odd u16 lanes. - let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------ - // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3 - // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7 - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - - // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi. - let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8 - let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0] - let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8 - - // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group. - let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0] - let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0] - let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3] - - // Split U and V (4 valid low-half lanes each). - let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8 - let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8 - - // Center UV: subtract 32768 wrapping. - let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v); - let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v); - - // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale. - let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3] - let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3] - // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are - // 0x8080 garbage from the -1-byte shuffles, but we don't use them. - // Widen the high half too for `chroma_i16x8` (don't-care input). - let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16)); - let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v)); - - // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3, - // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid. - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - - // Duplicate each chroma sample into its Y-pair slot (4:2:2): - // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3] - let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo); - let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo); - let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo); - - // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767. - let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v); - - // Saturating add and narrow to u8. - let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128()); - let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128()); - let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128()); - - // --- hi group: pixels x+8..x+15 --------------------------------- - let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - - let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx); - let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx); - let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - - let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx); - let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx); - let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half); - - let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx); - let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx); - - let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v); - let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v); - - let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16); - let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16)); - let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16)); - - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v)); - - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - - let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi); - let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi); - let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi); - - let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v); - - let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128()); - let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128()); - let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128()); - - // Combine two 8-pixel groups into 16-pixel output. - // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes. - // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8. - let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8); - let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8); - let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8); - - if ALPHA { - write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm_set1_epi32(RND); + // Y216 samples are full u16 [0..65535]; use i32 y_off and + // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767. + let y_off_v = _mm_set1_epi32(y_off); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000. + let bias16_v = _mm_set1_epi16(-32768i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); + + // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each). + // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes. + // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes): + // 0,1 = Y0 2,3 = U0 4,5 = Y1 6,7 = V0 + // 8,9 = Y2 10,11 = U1 12,13 = Y3 14,15 = V1 + // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes. + let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + // U lanes from interleaved [U,V,U,V,...]: even u16 lanes. + let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + // V lanes: odd u16 lanes. + let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 16 <= width { + // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------ + // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3 + // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7 + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + + // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi. + let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8 + let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0] + let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8 + + // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group. + let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0] + let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0] + let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3] + + // Split U and V (4 valid low-half lanes each). + let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8 + let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8 + + // Center UV: subtract 32768 wrapping. + let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v); + let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v); + + // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale. + let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3] + let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3] + // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are + // 0x8080 garbage from the -1-byte shuffles, but we don't use them. + // Widen the high half too for `chroma_i16x8` (don't-care input). + let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16)); + let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v)); + + // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3, + // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid. + let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + + // Duplicate each chroma sample into its Y-pair slot (4:2:2): + // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3] + let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo); + + // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767. + let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v); + + // Saturating add and narrow to u8. + let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128()); + let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128()); + let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128()); + + // --- hi group: pixels x+8..x+15 --------------------------------- + let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + + let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx); + let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx); + let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + + let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx); + let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx); + let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half); + + let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx); + let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx); + + let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v); + let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v); + + let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16); + let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16)); + let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16)); + + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v)); + + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + + let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi); + let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi); + let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi); + + let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v); + + let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128()); + let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128()); + let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128()); + + // Combine two 8-pixel groups into 16-pixel output. + // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes. + // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8. + let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8); + let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8); + let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8); + + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + + x += 16; } + } // end if !BE - x += 16; - } - - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -241,7 +249,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -258,147 +266,149 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND: i64 = 1 << 14; unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_v = _mm_set1_epi64x(RND); - let rnd32_v = _mm_set1_epi32(1 << 14); - let y_off_v = _mm_set1_epi32(y_off); - let y_scale_v = _mm_set1_epi32(y_scale); - let c_scale_v = _mm_set1_epi32(c_scale); - // bias 32768 via wrapping i16 trick - let bias16_v = _mm_set1_epi16(-32768i16); - let cru = _mm_set1_epi32(coeffs.r_u()); - let crv = _mm_set1_epi32(coeffs.r_v()); - let cgu = _mm_set1_epi32(coeffs.g_u()); - let cgv = _mm_set1_epi32(coeffs.g_v()); - let cbu = _mm_set1_epi32(coeffs.b_u()); - let cbv = _mm_set1_epi32(coeffs.b_v()); - - // Byte-level shuffle masks (same as u8 path). - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - // Two 128-bit loads: each covers 8 u16 = 4 pixels. - // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1] - // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3] - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - - // Y: [Y0..Y7] u16x8 - let y_lo_half = _mm_shuffle_epi8(lo, y_idx); - let y_hi_half = _mm_shuffle_epi8(hi, y_idx); - let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); - - // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3] - let c_lo_half = _mm_shuffle_epi8(lo, c_idx); - let c_hi_half = _mm_shuffle_epi8(hi, c_idx); - let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half); - - // U and V (4 valid low-half lanes each) - let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0] - let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0] - - // Center UV via wrapping i16 subtraction. - let u_i16 = _mm_sub_epi16(u_vec4, bias16_v); - let v_i16 = _mm_sub_epi16(v_vec4, bias16_v); - - // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16). - let u_i32 = _mm_cvtepi16_epi32(u_i16); - let v_i32 = _mm_cvtepi16_epi32(v_i16); - let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v)); - let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v)); - - // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes. - let u_d_even = u_d; - let v_d_even = v_d; - let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even - let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v); - let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v); - let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v); - let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); - let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v); - let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); - - // Reassemble i64x2 pairs (even + odd) → i32x4. - let r_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(r_ch_even, r_ch_odd), - _mm_unpackhi_epi32(r_ch_even, r_ch_odd), - ); - let g_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(g_ch_even, g_ch_odd), - _mm_unpackhi_epi32(g_ch_even, g_ch_odd), - ); - let b_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(b_ch_even, b_ch_odd), - _mm_unpackhi_epi32(b_ch_even, b_ch_odd), - ); - - // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2). - // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3) - // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7) - let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32); - let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32); - let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32); - let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32); - let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32); - let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32); - - // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. - let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32 - let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7] - let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v); - let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v); - - // Even/odd split for _mm_mul_epi32. - let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v); - let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v); - let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v); - let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v); - - // Reassemble Y i64x2 pairs to i32x4. - let y_lo_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(y_lo_even, y_lo_odd), - _mm_unpackhi_epi32(y_lo_even, y_lo_odd), - ); - let y_hi_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(y_hi_even, y_hi_odd), - _mm_unpackhi_epi32(y_hi_even, y_hi_odd), - ); - - // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32. - let r_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, r_dup_lo), - _mm_add_epi32(y_hi_i32, r_dup_hi), - ); - let g_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, g_dup_lo), - _mm_add_epi32(y_hi_i32, g_dup_hi), - ); - let b_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, b_dup_lo), - _mm_add_epi32(y_hi_i32, b_dup_hi), - ); - - if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_v = _mm_set1_epi64x(RND); + let rnd32_v = _mm_set1_epi32(1 << 14); + let y_off_v = _mm_set1_epi32(y_off); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + // bias 32768 via wrapping i16 trick + let bias16_v = _mm_set1_epi16(-32768i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + // Byte-level shuffle masks (same as u8 path). + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + // Two 128-bit loads: each covers 8 u16 = 4 pixels. + // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1] + // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3] + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + + // Y: [Y0..Y7] u16x8 + let y_lo_half = _mm_shuffle_epi8(lo, y_idx); + let y_hi_half = _mm_shuffle_epi8(hi, y_idx); + let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); + + // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3] + let c_lo_half = _mm_shuffle_epi8(lo, c_idx); + let c_hi_half = _mm_shuffle_epi8(hi, c_idx); + let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half); + + // U and V (4 valid low-half lanes each) + let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0] + let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0] + + // Center UV via wrapping i16 subtraction. + let u_i16 = _mm_sub_epi16(u_vec4, bias16_v); + let v_i16 = _mm_sub_epi16(v_vec4, bias16_v); + + // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16). + let u_i32 = _mm_cvtepi16_epi32(u_i16); + let v_i32 = _mm_cvtepi16_epi32(v_i16); + let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v)); + let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v)); + + // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes. + let u_d_even = u_d; + let v_d_even = v_d; + let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even + let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v); + let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v); + let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v); + let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); + let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v); + let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); + + // Reassemble i64x2 pairs (even + odd) → i32x4. + let r_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(r_ch_even, r_ch_odd), + _mm_unpackhi_epi32(r_ch_even, r_ch_odd), + ); + let g_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(g_ch_even, g_ch_odd), + _mm_unpackhi_epi32(g_ch_even, g_ch_odd), + ); + let b_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(b_ch_even, b_ch_odd), + _mm_unpackhi_epi32(b_ch_even, b_ch_odd), + ); + + // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2). + // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3) + // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7) + let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32); + let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32); + let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32); + let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32); + let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32); + let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32); + + // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. + let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32 + let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7] + let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v); + let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v); + + // Even/odd split for _mm_mul_epi32. + let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v); + let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v); + let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v); + let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v); + + // Reassemble Y i64x2 pairs to i32x4. + let y_lo_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(y_lo_even, y_lo_odd), + _mm_unpackhi_epi32(y_lo_even, y_lo_odd), + ); + let y_hi_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(y_hi_even, y_hi_odd), + _mm_unpackhi_epi32(y_hi_even, y_hi_odd), + ); + + // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32. + let r_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, r_dup_lo), + _mm_add_epi32(y_hi_i32, r_dup_hi), + ); + let g_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, g_dup_lo), + _mm_add_epi32(y_hi_i32, g_dup_hi), + ); + let b_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, b_dup_lo), + _mm_add_epi32(y_hi_i32, b_dup_hi), + ); + + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + + x += 8; } + } // end if !BE - x += 8; - } - - // Scalar tail — remaining < 8 pixels. + // Scalar tail — remaining < 8 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -423,49 +433,55 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { - // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes. - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - // Four loads covering 16 pixels (16 u16 per load pair). - // packed offset x*2 = quadruple-base for pixel x. - // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15. - let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - - // Extract Y lanes into u16x8. - let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..] - let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..] - let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] - - let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..] - let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..] - let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - - // `>> 8` to get u8 luma (high byte of each Y sample). - let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo); - let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi); - // Pack 16 × i16 → 16 × u8. - let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr); - _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 16; + if !BE { + // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 16 <= width { + // Four loads covering 16 pixels (16 u16 per load pair). + // packed offset x*2 = quadruple-base for pixel x. + // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15. + let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + + // Extract Y lanes into u16x8. + let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..] + let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..] + let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] + + let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..] + let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..] + let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + + // `>> 8` to get u8 luma (high byte of each Y sample). + let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo); + let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi); + // Pack 16 × i16 → 16 × u8. + let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr); + _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -484,41 +500,47 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + if !BE { + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); - let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); - let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] + while x + 16 <= width { + let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); - let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); - let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); + let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); + let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] - // Direct copy — full 16-bit Y values, no shift. - _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo); - _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi); + let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); + let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); + let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - x += 16; + // Direct copy — full 16-bit Y values, no shift. + _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo); + _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi); + + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/y2xx.rs b/src/row/arch/x86_sse41/y2xx.rs index eaa88f7e..e8e18aff 100644 --- a/src/row/arch/x86_sse41/y2xx.rs +++ b/src/row/arch/x86_sse41/y2xx.rs @@ -130,7 +130,11 @@ unsafe fn unpack_y2xx_8px_sse41( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -158,111 +162,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - - // 8-lane chroma vectors with valid data in lanes 0..3. - let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via - // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with - // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3]. - let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); - let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); - let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits - // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` (or zero - // for hi) so the low 8 bytes of the result hold the saturated - // u8 of the input i16x8. Only the first 8 bytes per channel - // matter. - let zero = _mm_setzero_si128(); - let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero); - let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero); - let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero); - - // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16` - // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter - // body we use the v210-style stack-buffer + scalar interleave - // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.) - let mut r_tmp = [0u8; 16]; - let mut g_tmp = [0u8; 16]; - let mut b_tmp = [0u8; 16]; - _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8); - _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8); - _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 8 * 4]; - for i in 0..8 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 8 * 3]; - for i in 0..8 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm_srl_epi16`, see + // module-level note. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + // Widen 8-lane i16 chroma to two i32x4 halves so the Q15 + // multiplies don't overflow. Only lanes 0..3 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x8` to recycle the helper exactly; the + // don't-care output lanes are discarded by the + // `_mm_unpacklo_epi16` duplicate step below (which only consumes + // lanes 0..3). + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + // 8-lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with + // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3]. + let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); + let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); + let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits + // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` (or zero + // for hi) so the low 8 bytes of the result hold the saturated + // u8 of the input i16x8. Only the first 8 bytes per channel + // matter. + let zero = _mm_setzero_si128(); + let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero); + let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero); + let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero); + + // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16` + // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter + // body we use the v210-style stack-buffer + scalar interleave + // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.) + let mut r_tmp = [0u8; 16]; + let mut g_tmp = [0u8; 16]; + let mut b_tmp = [0u8; 16]; + _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8); + _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8); + _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 8 * 4]; + for i in 0..8 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 8 * 3]; + for i in 0..8 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 8; - } + x += 8; + } + } // end if !BE - // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // Scalar tail — remaining < 8 pixels (always even per 4:2:2), + // or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -288,7 +295,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -314,72 +325,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - - let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); - let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); - let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. - // `_mm_adds_epi16` saturates at i16 bounds (no-op here since - // |sum| stays well inside i16 for BITS ≤ 12), then min/max - // clamps to the BITS range. - let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v); - let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v); - let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v); - - if ALPHA { - let alpha = _mm_set1_epi16(out_max); - write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3)); - } + if !BE { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm_set1_epi16(out_max); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); + let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); + let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. + // `_mm_adds_epi16` saturates at i16 bounds (no-op here since + // |sum| stays well inside i16 for BITS ≤ 12), then min/max + // clamps to the BITS range. + let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v); + let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v); + let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v); + + if ALPHA { + let alpha = _mm_set1_epi16(out_max); + write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3)); + } - x += 8; - } + x += 8; + } + } // end if !BE if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -405,7 +418,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -422,39 +435,41 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - // Y permute mask: pick even u16 lanes (low byte at [0], high byte - // at [1]) into the low 8 bytes; high 8 bytes zeroed. - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _] - let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _] - let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned - - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON's `vshrn_n_u16::<8>`. - // `_mm_srli_epi16::<8>` has a literal const count, so it works - // here without the runtime-count helper. - let y_shr = _mm_srli_epi16::<8>(y_vec); - // Pack 8 i16 lanes to u8 — only low 8 bytes used. - let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); - // Store low 8 bytes via stack buffer + copy_from_slice. - let mut tmp = [0u8; 16]; - _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8); - luma_out[x..x + 8].copy_from_slice(&tmp[..8]); - - x += 8; + if !BE { + // Y permute mask: pick even u16 lanes (low byte at [0], high byte + // at [1]) into the low 8 bytes; high 8 bytes zeroed. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _] + let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _] + let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned + + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON's `vshrn_n_u16::<8>`. + // `_mm_srli_epi16::<8>` has a literal const count, so it works + // here without the runtime-count helper. + let y_shr = _mm_srli_epi16::<8>(y_vec); + // Pack 8 i16 lanes to u8 — only low 8 bytes used. + let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); + // Store low 8 bytes via stack buffer + copy_from_slice. + let mut tmp = [0u8; 16]; + _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8); + luma_out[x..x + 8].copy_from_slice(&tmp[..8]); + + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -471,7 +486,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -488,28 +503,30 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = _mm_shuffle_epi8(lo, y_idx); - let y_hi = _mm_shuffle_epi8(hi, y_idx); - let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples - // into low-bit-packed form for the native-depth u16 output. - let y_low = _mm_srl_epi16(y_vec, shr_count); - _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 8; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = _mm_shuffle_epi8(lo, y_idx); + let y_hi = _mm_shuffle_epi8(hi, y_idx); + let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = _mm_srl_epi16(y_vec, shr_count); + _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/dispatch/v210.rs b/src/row/dispatch/v210.rs index 2760c4b2..7d1c14b1 100644 --- a/src/row/dispatch/v210.rs +++ b/src/row/dispatch/v210.rs @@ -7,8 +7,8 @@ //! block; `use_simd = false` forces scalar. //! //! The per-format SIMD kernels are const-generic on `ALPHA` -//! (`v210_to_rgb_or_rgba_row::` / -//! `v210_to_rgb_u16_or_rgba_u16_row::`) — the public +//! (`v210_to_rgb_or_rgba_row::` / +//! `v210_to_rgb_u16_or_rgba_u16_row::`) — the public //! dispatchers split them into RGB vs. RGBA entries by hard-wiring //! `ALPHA = false` / `true`. @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of v210 to packed RGB (u8). See /// [`scalar::v210_to_rgb_or_rgba_row`] for byte layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (32-bit words stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn v210_to_rgb_row( packed: &[u8], @@ -40,6 +41,7 @@ pub fn v210_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn v210_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn v210_to_rgb_row( } } - scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of v210 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn v210_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn v210_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn v210_to_rgba_row( } } - scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of v210 to packed `u16` RGB at native 10-bit @@ -167,6 +218,7 @@ pub fn v210_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn v210_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn v210_to_rgb_u16_row( } } - scalar::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of v210 to packed `u16` RGBA at native 10-bit @@ -231,6 +311,7 @@ pub fn v210_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn v210_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn v210_to_rgba_u16_row( } } - scalar::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed v210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "v210 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::neon::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::v210_to_luma_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::v210_to_luma_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si } } - scalar::v210_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::v210_to_luma_row::(packed, luma_out, width), + scalar::v210_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed v210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "v210 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::neon::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u } } - scalar::v210_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::v210_to_luma_u16_row::(packed, luma_out, width), + scalar::v210_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -435,7 +604,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 6 * 3]; - v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false); + v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -444,7 +613,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 6 * 4]; - v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false); + v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -452,7 +621,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 6 * 3]; - v210_to_rgb_u16_row(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true, false); + v210_to_rgb_u16_row( + &word, + &mut rgb_u16, + 6, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -461,21 +638,29 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 6 * 4]; - v210_to_rgba_u16_row(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true, false); + v210_to_rgba_u16_row( + &word, + &mut rgba_u16, + 6, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 6]; - v210_to_luma_row(&word, &mut luma, 6, false); + v210_to_luma_row(&word, &mut luma, 6, false, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. let mut luma_u16 = [0u16; 6]; - v210_to_luma_u16_row(&word, &mut luma_u16, 6, false); + v210_to_luma_u16_row(&word, &mut luma_u16, 6, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y210.rs b/src/row/dispatch/y210.rs index e9ab9eca..97bd0766 100644 --- a/src/row/dispatch/y210.rs +++ b/src/row/dispatch/y210.rs @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of Y210 to packed RGB (u8). See /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn y210_to_rgb_row( packed: &[u16], @@ -40,6 +41,7 @@ pub fn y210_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn y210_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn y210_to_rgb_row( } } - scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y210 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn y210_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn y210_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn y210_to_rgba_row( } } - scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y210 to packed `u16` RGB at native 10-bit @@ -167,6 +218,7 @@ pub fn y210_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn y210_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn y210_to_rgb_u16_row( } } - scalar::y210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y210 to packed `u16` RGBA at native 10-bit @@ -231,6 +311,7 @@ pub fn y210_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn y210_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn y210_to_rgba_u16_row( } } - scalar::y210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y210_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y210 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y210_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y210_to_luma_row::(packed, luma_out, width), + scalar::y210_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y210_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y210 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y210_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y210_to_luma_u16_row::(packed, luma_out, width), + scalar::y210_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -433,7 +602,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -442,7 +611,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -450,7 +619,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y210_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y210_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -459,21 +636,29 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 8 * 4]; - y210_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y210_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 8]; - y210_to_luma_row(&buf, &mut luma, 8, false); + y210_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. let mut luma_u16 = [0u16; 8]; - y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y212.rs b/src/row/dispatch/y212.rs index aa253721..2245c50e 100644 --- a/src/row/dispatch/y212.rs +++ b/src/row/dispatch/y212.rs @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of Y212 to packed RGB (u8). See /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn y212_to_rgb_row( packed: &[u16], @@ -40,6 +41,7 @@ pub fn y212_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn y212_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn y212_to_rgb_row( } } - scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y212 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn y212_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn y212_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn y212_to_rgba_row( } } - scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y212 to packed `u16` RGB at native 12-bit @@ -167,6 +218,7 @@ pub fn y212_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn y212_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn y212_to_rgb_u16_row( } } - scalar::y212_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y212 to packed `u16` RGBA at native 12-bit @@ -231,6 +311,7 @@ pub fn y212_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn y212_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn y212_to_rgba_u16_row( } } - scalar::y212_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y212 buffer. /// Y values are downshifted from 12-bit to 8-bit via `>> 4`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y212_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y212 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y212_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y212_to_luma_row::(packed, luma_out, width), + scalar::y212_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y212 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in /// its low 12 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y212_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y212 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y212_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y212_to_luma_u16_row::(packed, luma_out, width), + scalar::y212_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -433,7 +602,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -442,7 +611,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -450,7 +619,15 @@ mod tests { // u16 RGB at native 12-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y212_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y212_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(2048) <= 2); assert_eq!(px[0], px[1]); @@ -459,21 +636,29 @@ mod tests { // u16 RGBA — alpha = 4095. let mut rgba_u16 = [0u16; 8 * 4]; - y212_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y212_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 4095); } // u8 luma — Y=2048 → 128 after `>> 4`. let mut luma = [0u8; 8]; - y212_to_luma_row(&buf, &mut luma, 8, false); + y212_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (2048u16 >> 4) as u8); } // u16 luma — low-packed 12-bit Y. let mut luma_u16 = [0u16; 8]; - y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 2048); } diff --git a/src/row/dispatch/y216.rs b/src/row/dispatch/y216.rs index 9f0fc6de..541022c7 100644 --- a/src/row/dispatch/y216.rs +++ b/src/row/dispatch/y216.rs @@ -30,7 +30,8 @@ use crate::{ /// Converts one row of Y216 to packed RGB (u8). See /// [`scalar::y216_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn y216_to_rgb_row( packed: &[u16], @@ -39,6 +40,7 @@ pub fn y216_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -53,36 +55,57 @@ pub fn y216_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -90,7 +113,10 @@ pub fn y216_to_rgb_row( } } - scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y216 to packed RGBA (u8) with `α = 0xFF`. @@ -102,6 +128,7 @@ pub fn y216_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -116,36 +143,57 @@ pub fn y216_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -153,7 +201,10 @@ pub fn y216_to_rgba_row( } } - scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y216 to packed `u16` RGB at native 16-bit @@ -166,6 +217,7 @@ pub fn y216_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -180,36 +232,57 @@ pub fn y216_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -217,7 +290,14 @@ pub fn y216_to_rgb_u16_row( } } - scalar::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y216 to packed `u16` RGBA at native 16-bit @@ -230,6 +310,7 @@ pub fn y216_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -244,36 +325,57 @@ pub fn y216_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -281,13 +383,26 @@ pub fn y216_to_rgba_u16_row( } } - scalar::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y216 buffer. /// Y values are downshifted from 16-bit to 8-bit via `>> 8`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y216_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y216 requires even width (4:2:2 chroma pair)" @@ -298,36 +413,57 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::neon::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y216_to_luma_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y216_to_luma_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, @@ -335,13 +471,22 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y216_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y216_to_luma_row::(packed, luma_out, width), + scalar::y216_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y216 /// buffer (full-range: each `u16` carries the 16-bit Y value directly). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y216_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y216 requires even width (4:2:2 chroma pair)" @@ -352,36 +497,57 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::neon::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, @@ -389,7 +555,10 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y216_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y216_to_luma_u16_row::(packed, luma_out, width), + scalar::y216_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -431,7 +600,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -440,7 +609,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -448,7 +617,15 @@ mod tests { // u16 RGB at native 16-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y216_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y216_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(32768) <= 4); assert_eq!(px[0], px[1]); @@ -457,21 +634,29 @@ mod tests { // u16 RGBA — alpha = 0xFFFF. let mut rgba_u16 = [0u16; 8 * 4]; - y216_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y216_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 0xFFFF); } // u8 luma — Y=32768 → 128 after `>> 8`. let mut luma = [0u8; 8]; - y216_to_luma_row(&buf, &mut luma, 8, false); + y216_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (32768u16 >> 8) as u8); } // u16 luma — full 16-bit Y value. let mut luma_u16 = [0u16; 8]; - y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 32768); } @@ -483,7 +668,7 @@ mod tests { // packed buffer has only 2 elements for width=4 (needs 8). let packed = [0u16; 2]; let mut rgb = [0u8; 4 * 3]; - y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -492,7 +677,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). let packed = [0u16; 8]; let mut rgb = [0u8; 2]; - y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -500,7 +685,7 @@ mod tests { fn y216_dispatcher_rejects_odd_width() { let packed = [0u16; 6]; let mut rgb = [0u8; 9]; - y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false, false); } #[test] @@ -521,6 +706,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/mod.rs b/src/row/mod.rs index 297f1c3c..93feab94 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -928,7 +928,15 @@ mod overflow_tests { let candidate = ((usize::MAX / 16) + 1) * 6; let p: [u8; 0] = []; let mut rgb: [u8; 0] = []; - v210_to_rgb_row(&p, &mut rgb, candidate, ColorMatrix::Bt601, true, false); + v210_to_rgb_row( + &p, + &mut rgb, + candidate, + ColorMatrix::Bt601, + true, + false, + false, + ); } // ---- Y2xx dispatcher — `width × 2` overflow ---- @@ -958,6 +966,7 @@ mod overflow_tests { ColorMatrix::Bt601, true, false, + false, ); } } diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs index b0e390ee..5c8cb66c 100644 --- a/src/row/scalar/mod.rs +++ b/src/row/scalar/mod.rs @@ -123,6 +123,59 @@ pub(crate) use yuv_planar_high_bit::*; // ---- Shared scalar helpers (used across all conversion families) ------- +/// Reads one `u16` from the byte address `ptr` in the endianness +/// indicated by `BE`. `BE = false` → little-endian (native v210/Y2xx +/// on-wire format); `BE = true` → big-endian. The unused branch is +/// eliminated by the compiler when the caller is monomorphized. +/// +/// **Target-endian aware** — this matches the SIMD `load_endian_u16x*` +/// helpers' semantics: `u16::from_be_bytes` / `u16::from_le_bytes` +/// each emit a `bswap` only when the source byte order differs from +/// the host CPU's native order. On a BE host the `BE = true` branch +/// is a plain load (no swap) and the `BE = false` branch swaps; on +/// an LE host the polarity reverses. This is the strict-superset-of- +/// bugs alternative to a naive `if BE { x.swap_bytes() }` pattern, +/// which would corrupt rows on s390x / other BE hosts. See +/// `fix(be-tier10b): make scalar BE conversion target-endian aware` +/// for the codex finding that motivated this contract crate-wide. +/// +/// # Safety +/// +/// `ptr` must point to at least 2 readable bytes. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) unsafe fn load_endian_u16(ptr: *const u8) -> u16 { + let bytes = unsafe { [*ptr, *ptr.add(1)] }; + if BE { + u16::from_be_bytes(bytes) + } else { + u16::from_le_bytes(bytes) + } +} + +/// Reads one `u32` from the byte address `ptr` in the endianness +/// indicated by `BE`. `BE = false` → little-endian; `BE = true` → +/// big-endian. The unused branch is eliminated by the compiler when +/// the caller is monomorphized. +/// +/// **Target-endian aware** — `u32::from_be_bytes` / `u32::from_le_bytes` +/// each emit a `bswap` only when the source byte order differs from +/// the host CPU's native order, matching the SIMD `load_endian_u32x*` +/// helpers. See [`load_endian_u16`] for the full target-endian +/// contract and the codex motivation. +/// +/// # Safety +/// +/// `ptr` must point to at least 4 readable bytes. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) unsafe fn load_endian_u32(ptr: *const u8) -> u32 { + let bytes = unsafe { [*ptr, *ptr.add(1), *ptr.add(2), *ptr.add(3)] }; + if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + } +} + #[cfg_attr(not(tarpaulin), inline(always))] pub(super) fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 diff --git a/src/row/scalar/v210.rs b/src/row/scalar/v210.rs index 00a4e029..1b9db248 100644 --- a/src/row/scalar/v210.rs +++ b/src/row/scalar/v210.rs @@ -10,6 +10,15 @@ //! word 2: `[Cr1, Y3, Cb2]` //! word 3: `[Y4, Cr2, Y5]` //! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each 32-bit word in the packed stream is +//! stored in big-endian byte order. `load_endian_u32::` handles +//! the conditional byte-swap at each u32 load site inside +//! `unpack_v210_word`; the `BE = false` path is identical to the +//! previous `u32::from_le_bytes` decode. The unused branch is +//! eliminated at monomorphization. +//! //! ## Partial-word support //! //! Real captures (e.g. 720p = 1280 wide) commonly end on a partial @@ -32,14 +41,16 @@ use super::*; /// Extracts 6 Y + 3 U + 3 V 10-bit samples from one 16-byte v210 /// word. Output samples are 10-bit values in the low 10 bits of -/// each `u16`. +/// each `u16`. `BE = true` reads each 32-bit word in big-endian +/// byte order. #[cfg_attr(not(tarpaulin), inline(always))] -fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { +fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { debug_assert_eq!(word.len(), 16); - let w0 = u32::from_le_bytes([word[0], word[1], word[2], word[3]]); - let w1 = u32::from_le_bytes([word[4], word[5], word[6], word[7]]); - let w2 = u32::from_le_bytes([word[8], word[9], word[10], word[11]]); - let w3 = u32::from_le_bytes([word[12], word[13], word[14], word[15]]); + // SAFETY: word has exactly 16 bytes (checked above); each offset is ≤ 12. + let w0 = unsafe { load_endian_u32::(word.as_ptr()) }; + let w1 = unsafe { load_endian_u32::(word.as_ptr().add(4)) }; + let w2 = unsafe { load_endian_u32::(word.as_ptr().add(8)) }; + let w3 = unsafe { load_endian_u32::(word.as_ptr().add(12)) }; // Word 0: [Cb0, Y0, Cr0] let cb0 = (w0 & 0x3FF) as u16; @@ -70,14 +81,14 @@ fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { /// /// Supports any **even** `width`: complete 6-px words run the full /// loop; a final partial word emits 2 or 4 pixels from its valid -/// chroma-pair prefix. +/// chroma-pair prefix. `BE = true` selects big-endian u32 word decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_rgb_or_rgba_row( +pub(crate) fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -101,7 +112,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row( for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); // 6 pixels per word; each chroma pair (U[i], V[i]) covers // Y[2i] and Y[2i+1]. @@ -135,7 +146,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row( // pairs are valid (1 pair for 2 px; 2 pairs for 4 px). let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); let pairs = tail_pixels / 2; for i in 0..pairs { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -172,14 +183,15 @@ pub(crate) fn v210_to_rgb_or_rgba_row( /// `(1 << 10) - 1 = 1023` (opaque maximum at 10-bit). /// /// Supports any **even** `width`: see [`v210_to_rgb_or_rgba_row`] -/// for partial-word semantics. +/// for partial-word semantics. `BE = true` selects big-endian u32 word +/// decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -204,7 +216,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row( for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); for i in 0..3 { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -232,7 +244,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row( if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); let pairs = tail_pixels / 2; for i in 0..pairs { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -262,13 +274,14 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row( /// Scalar v210 → 8-bit luma. Y values are downshifted from 10-bit /// to 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely. +/// `BE = true` selects big-endian u32 word decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); debug_assert!(packed.len() >= total_words * 16, "packed row too short"); @@ -279,7 +292,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::(word); for k in 0..6 { luma_out[w * 6 + k] = (ys[k] >> 2) as u8; } @@ -287,7 +300,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::(word); for k in 0..tail_pixels { luma_out[w * 6 + k] = (ys[k] >> 2) as u8; } @@ -296,14 +309,19 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) /// Scalar v210 → native-depth `u16` luma (low-bit-packed). Each /// output `u16` carries the source's 10-bit Y value in its low 10 -/// bits (upper 6 bits zero). +/// bits (upper 6 bits zero). `BE = true` selects big-endian u32 word +/// decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); debug_assert!(packed.len() >= total_words * 16, "packed row too short"); @@ -314,13 +332,13 @@ pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: u for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::(word); luma_out[w * 6..w * 6 + 6].copy_from_slice(&ys); } if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::(word); luma_out[w * 6..w * 6 + tail_pixels].copy_from_slice(&ys[..tail_pixels]); } } @@ -358,12 +376,34 @@ mod tests { out } + /// Pack a v210 word using big-endian u32 encoding (each 32-bit word stored BE). + fn pack_v210_word_be(samples: [u16; 12]) -> [u8; 16] { + let mut out = [0u8; 16]; + let w0 = (samples[0] as u32 & 0x3FF) + | ((samples[1] as u32 & 0x3FF) << 10) + | ((samples[2] as u32 & 0x3FF) << 20); + let w1 = (samples[3] as u32 & 0x3FF) + | ((samples[4] as u32 & 0x3FF) << 10) + | ((samples[5] as u32 & 0x3FF) << 20); + let w2 = (samples[6] as u32 & 0x3FF) + | ((samples[7] as u32 & 0x3FF) << 10) + | ((samples[8] as u32 & 0x3FF) << 20); + let w3 = (samples[9] as u32 & 0x3FF) + | ((samples[10] as u32 & 0x3FF) << 10) + | ((samples[11] as u32 & 0x3FF) << 20); + out[0..4].copy_from_slice(&w0.to_be_bytes()); + out[4..8].copy_from_slice(&w1.to_be_bytes()); + out[8..12].copy_from_slice(&w2.to_be_bytes()); + out[12..16].copy_from_slice(&w3.to_be_bytes()); + out + } + #[test] fn scalar_v210_to_rgb_gray_is_gray() { // Full-range gray: Y=512, U=V=512 (10-bit center). let word = pack_v210_word([512; 12]); let mut rgb = [0u8; 6 * 3]; - v210_to_rgb_or_rgba_row::(&word, &mut rgb, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&word, &mut rgb, 6, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -375,7 +415,7 @@ mod tests { fn scalar_v210_to_rgba_gray_is_gray_with_opaque_alpha() { let word = pack_v210_word([512; 12]); let mut rgba = [0u8; 6 * 4]; - v210_to_rgb_or_rgba_row::(&word, &mut rgba, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&word, &mut rgba, 6, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -387,7 +427,13 @@ mod tests { // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023). let word = pack_v210_word([512; 12]); let mut rgb_u16 = [0u16; 6 * 3]; - v210_to_rgb_u16_or_rgba_u16_row::(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true); + v210_to_rgb_u16_or_rgba_u16_row::( + &word, + &mut rgb_u16, + 6, + ColorMatrix::Bt709, + true, + ); for px in rgb_u16.chunks(3) { // Gray luma at 512 / full-range produces RGB ~512 in 10-bit. assert!(px[0].abs_diff(512) <= 2); @@ -400,7 +446,13 @@ mod tests { fn scalar_v210_to_rgba_u16_alpha_is_max() { let word = pack_v210_word([512; 12]); let mut rgba_u16 = [0u16; 6 * 4]; - v210_to_rgb_u16_or_rgba_u16_row::(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true); + v210_to_rgb_u16_or_rgba_u16_row::( + &word, + &mut rgba_u16, + 6, + ColorMatrix::Bt709, + true, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1"); } @@ -413,7 +465,7 @@ mod tests { ]; let word = pack_v210_word(samples); let mut luma = [0u8; 6]; - v210_to_luma_row(&word, &mut luma, 6); + v210_to_luma_row::(&word, &mut luma, 6); // Y values: 200, 300, 400, 500, 600, 700 → 10-bit, downshift >> 2. assert_eq!(luma[0], (200u16 >> 2) as u8); assert_eq!(luma[1], (300u16 >> 2) as u8); @@ -428,7 +480,7 @@ mod tests { let samples = [100, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700]; let word = pack_v210_word(samples); let mut luma = [0u16; 6]; - v210_to_luma_u16_row(&word, &mut luma, 6); + v210_to_luma_u16_row::(&word, &mut luma, 6); assert_eq!(luma[0], 200); assert_eq!(luma[1], 300); assert_eq!(luma[2], 400); @@ -445,7 +497,7 @@ mod tests { packed.extend_from_slice(&pack_v210_word(samples)); packed.extend_from_slice(&pack_v210_word(samples)); let mut rgb = std::vec![0u8; 12 * 3]; - v210_to_rgb_or_rgba_row::(&packed, &mut rgb, 12, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&packed, &mut rgb, 12, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); } @@ -468,19 +520,19 @@ mod tests { packed.extend_from_slice(&pack_v210_word([512; 12])); } let mut rgb = std::vec![0u8; width * 3]; - v210_to_rgb_or_rgba_row::(&packed, &mut rgb, width, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&packed, &mut rgb, width, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1, "width={width}: gray RGB diverged"); assert_eq!(px[0], px[1]); } let mut rgba = std::vec![0u8; width * 4]; - v210_to_rgb_or_rgba_row::(&packed, &mut rgba, width, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&packed, &mut rgba, width, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); } let mut rgb_u16 = std::vec![0u16; width * 3]; - v210_to_rgb_u16_or_rgba_u16_row::( + v210_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgb_u16, width, @@ -491,12 +543,12 @@ mod tests { assert!(px[0].abs_diff(512) <= 2); } let mut luma = std::vec![0u8; width]; - v210_to_luma_row(&packed, &mut luma, width); + v210_to_luma_row::(&packed, &mut luma, width); for &y in &luma { assert_eq!(y, 128); } let mut luma_u16 = std::vec![0u16; width]; - v210_to_luma_u16_row(&packed, &mut luma_u16, width); + v210_to_luma_u16_row::(&packed, &mut luma_u16, width); for &y in &luma_u16 { assert_eq!(y, 512); } @@ -558,8 +610,81 @@ mod tests { ]; let word = pack_v210_word(samples); let mut luma = [0u8; 2]; - v210_to_luma_row(&word, &mut luma, 2); + v210_to_luma_row::(&word, &mut luma, 2); assert_eq!(luma[0], (600u16 >> 2) as u8); assert_eq!(luma[1], (700u16 >> 2) as u8); } + + // ---- BE parity tests ----------------------------------------------- + // + // For each output type: pack the same samples in BE word encoding, + // run the BE=true path, assert identical output to the LE=false path. + + #[test] + fn scalar_v210_be_rgb_matches_le() { + let samples = [ + 100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_rgb = [0u8; 6 * 3]; + let mut be_rgb = [0u8; 6 * 3]; + v210_to_rgb_or_rgba_row::(&le_word, &mut le_rgb, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::(&be_word, &mut be_rgb, 6, ColorMatrix::Bt709, true); + assert_eq!(le_rgb, be_rgb, "BE rgb output must match LE"); + } + + #[test] + fn scalar_v210_be_rgb_u16_matches_le() { + let samples = [ + 100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_rgb = [0u16; 6 * 3]; + let mut be_rgb = [0u16; 6 * 3]; + v210_to_rgb_u16_or_rgba_u16_row::( + &le_word, + &mut le_rgb, + 6, + ColorMatrix::Bt709, + true, + ); + v210_to_rgb_u16_or_rgba_u16_row::( + &be_word, + &mut be_rgb, + 6, + ColorMatrix::Bt709, + true, + ); + assert_eq!(le_rgb, be_rgb, "BE rgb_u16 output must match LE"); + } + + #[test] + fn scalar_v210_be_luma_matches_le() { + let samples = [ + 100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_luma = [0u8; 6]; + let mut be_luma = [0u8; 6]; + v210_to_luma_row::(&le_word, &mut le_luma, 6); + v210_to_luma_row::(&be_word, &mut be_luma, 6); + assert_eq!(le_luma, be_luma, "BE luma output must match LE"); + } + + #[test] + fn scalar_v210_be_luma_u16_matches_le() { + let samples = [ + 100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_luma = [0u16; 6]; + let mut be_luma = [0u16; 6]; + v210_to_luma_u16_row::(&le_word, &mut le_luma, 6); + v210_to_luma_u16_row::(&be_word, &mut be_luma, 6); + assert_eq!(le_luma, be_luma, "BE luma_u16 output must match LE"); + } } diff --git a/src/row/scalar/y216.rs b/src/row/scalar/y216.rs index 088ec22e..291e8914 100644 --- a/src/row/scalar/y216.rs +++ b/src/row/scalar/y216.rs @@ -6,13 +6,21 @@ //! `src/row/scalar/yuv_planar_16bit.rs`'s i64 chroma scalar //! pattern but sourced from YUYV-shaped u16 quadruples rather //! than separate Y/U/V planes. +//! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each `u16` element in `packed` is stored in +//! big-endian byte order. `load_endian_u16::` handles the +//! conditional byte-swap at each sample site; the unused branch is +//! eliminated at monomorphization. use super::*; // ---- u8 RGB / RGBA output (i32 chroma — same as Y210/Y212) ------- +/// `BE = true` selects big-endian wire decoding for each u16 sample. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_rgb_or_rgba_row( +pub(crate) fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -29,13 +37,15 @@ pub(crate) fn y216_to_rgb_or_rgba_row( let bias = chroma_bias::<16>(); let pairs = width / 2; + // SAFETY: bounds validated above; off4 + 6 < packed.len() * 2 for p < pairs. + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; + let off4 = p * 4 * 2; // No right-shift: BITS=16 means samples are already full-width. - let y0 = q[0] as i32; - let u = q[1] as i32; - let y1 = q[2] as i32; - let v = q[3] as i32; + let y0 = unsafe { load_endian_u16::(base.add(off4)) } as i32; + let u = unsafe { load_endian_u16::(base.add(off4 + 2)) } as i32; + let y1 = unsafe { load_endian_u16::(base.add(off4 + 4)) } as i32; + let v = unsafe { load_endian_u16::(base.add(off4 + 6)) } as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -59,8 +69,9 @@ pub(crate) fn y216_to_rgb_or_rgba_row( // ---- u16 RGB / RGBA native-depth output (i64 chroma) ---------------- +/// `BE = true` selects big-endian wire decoding for each u16 sample. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -78,12 +89,13 @@ pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row( let out_max: i32 = 0xFFFF; let pairs = width / 2; + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = q[0] as i32; - let u = q[1] as i32; - let y1 = q[2] as i32; - let v = q[3] as i32; + let off4 = p * 4 * 2; + let y0 = unsafe { load_endian_u16::(base.add(off4)) } as i32; + let u = unsafe { load_endian_u16::(base.add(off4 + 2)) } as i32; + let y1 = unsafe { load_endian_u16::(base.add(off4 + 4)) } as i32; + let v = unsafe { load_endian_u16::(base.add(off4 + 6)) } as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -107,31 +119,38 @@ pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row( // ---- Luma (u8) — `>> 8` ---------------------------------------------- +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let pairs = width / 2; + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - out[p * 2] = (q[0] >> 8) as u8; - out[p * 2 + 1] = (q[2] >> 8) as u8; + let off4 = p * 4 * 2; + let y0 = unsafe { load_endian_u16::(base.add(off4)) }; + let y1 = unsafe { load_endian_u16::(base.add(off4 + 4)) }; + out[p * 2] = (y0 >> 8) as u8; + out[p * 2 + 1] = (y1 >> 8) as u8; } } // ---- Luma (u16, direct extract) --------------------------------------- +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let pairs = width / 2; + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - out[p * 2] = q[0]; // direct extract — full 16 bits, no shift - out[p * 2 + 1] = q[2]; + let off4 = p * 4 * 2; + // Direct extract — full 16 bits, no shift; byte-swap if BE. + out[p * 2] = unsafe { load_endian_u16::(base.add(off4)) }; + out[p * 2 + 1] = unsafe { load_endian_u16::(base.add(off4 + 4)) }; } } @@ -147,6 +166,11 @@ mod tests { [4096, 32768, 32000, 32768, 0, 16384, 65535, 49152] } + /// Byte-swap every u16 to produce the BE-encoded form. + fn to_be_u16(le: &[u16]) -> std::vec::Vec { + le.iter().map(|&v| v.swap_bytes()).collect() + } + /// u8 RGB output — hand-derived expected values for Bt709 limited range. /// /// Pair 0 (neutral chroma, U=V=32768=bias → u_d=v_d=0 → chroma=0): @@ -159,7 +183,7 @@ mod tests { fn y216_known_pattern_rgb() { let packed = test_input(); let mut out = [0u8; 4 * 3]; - y216_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); // Pixel 0: Y=4096 (limited-range black), neutral chroma → (0, 0, 0) assert_eq!(&out[0..3], &[0, 0, 0], "pixel 0 (Y=4096, neutral chroma)"); @@ -184,7 +208,7 @@ mod tests { fn y216_known_pattern_rgba() { let packed = test_input(); let mut out = [0u8; 4 * 4]; - y216_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]); assert_eq!(&out[4..8], &[127, 127, 127, 0xFF]); @@ -201,7 +225,13 @@ mod tests { fn y216_known_pattern_rgb_u16() { let packed = test_input(); let mut out = [0u16; 4 * 3]; - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut out, + 4, + ColorMatrix::Bt709, + false, + ); // Pixel 0: Y=4096 = limited-range floor → all channels 0 assert_eq!( @@ -226,7 +256,7 @@ mod tests { fn y216_known_pattern_rgba_u16() { let packed = test_input(); let mut out = [0u16; 4 * 4]; - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..4], &[0, 0, 0, 0xFFFF]); assert_eq!(&out[4..8], &[32618, 32618, 32618, 0xFFFF]); @@ -242,7 +272,7 @@ mod tests { fn y216_luma_extract() { let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555]; let mut out = [0u8; 2]; - y216_to_luma_row(&packed, &mut out, 2); + y216_to_luma_row::(&packed, &mut out, 2); assert_eq!(out[0], 0xAB, "Y0 luma u8"); assert_eq!(out[1], 0xCD, "Y1 luma u8"); } @@ -253,8 +283,55 @@ mod tests { fn y216_luma_u16_extract() { let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555]; let mut out = [0u16; 2]; - y216_to_luma_u16_row(&packed, &mut out, 2); + y216_to_luma_u16_row::(&packed, &mut out, 2); assert_eq!(out[0], 0xAB12, "Y0 luma u16"); assert_eq!(out[1], 0xCD34, "Y1 luma u16"); } + + // ---- BE=true parity tests ------------------------------------------- + + /// Verify byte-swapped Y216 input + BE=true matches LE+BE=false for RGB u8. + #[test] + fn y216_be_rgb_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut out_le = [0u8; 4 * 3]; + let mut out_be = [0u8; 4 * 3]; + y216_to_rgb_or_rgba_row::(&le, &mut out_le, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::(&be, &mut out_be, 4, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "BE and LE RGB paths must match"); + } + + #[test] + fn y216_be_rgb_u16_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut out_le = [0u16; 4 * 3]; + let mut out_be = [0u16; 4 * 3]; + y216_to_rgb_u16_or_rgba_u16_row::(&le, &mut out_le, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::(&be, &mut out_be, 4, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "BE and LE RGB u16 paths must match"); + } + + #[test] + fn y216_be_luma_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut luma_le = [0u8; 4]; + let mut luma_be = [0u8; 4]; + y216_to_luma_row::(&le, &mut luma_le, 4); + y216_to_luma_row::(&be, &mut luma_be, 4); + assert_eq!(luma_le, luma_be, "BE and LE luma paths must match"); + } + + #[test] + fn y216_be_luma_u16_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut luma_le = [0u16; 4]; + let mut luma_be = [0u16; 4]; + y216_to_luma_u16_row::(&le, &mut luma_le, 4); + y216_to_luma_u16_row::(&be, &mut luma_be, 4); + assert_eq!(luma_le, luma_be, "BE and LE luma_u16 paths must match"); + } } diff --git a/src/row/scalar/y2xx.rs b/src/row/scalar/y2xx.rs index 51aa7ba0..d3c7a8b3 100644 --- a/src/row/scalar/y2xx.rs +++ b/src/row/scalar/y2xx.rs @@ -10,6 +10,15 @@ //! `BITS` (mirrors `v210.rs`'s use of `range_params_n` / //! `chroma_bias` / `q15_scale` / `q15_chroma`, just sourced from //! Y2xx's u16 packed quadruples rather than v210's 16-byte words). +//! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each `u16` element in `packed` is stored in +//! big-endian byte order (high byte first). The `` +//! const-generic gates `load_endian_u16::` at each sample read +//! site; on LE targets the `BE = false` path is identical to the +//! previous plain slice index. On LE hosts with `BE = false` the +//! compiler eliminates the branch entirely. use super::*; @@ -31,12 +40,14 @@ const fn rshift_bits(sample: u16) -> u16 { /// (downshifted from the native BITS Q15 pipeline via /// `range_params_n::`). /// +/// `BE = true` selects big-endian wire decoding for each u16 sample. +/// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2` (one u16 quadruple per chroma pair). /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) fn y2xx_n_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -60,12 +71,15 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row( // One chroma pair (= 2 pixels) per iter. let pairs = width / 2; + // SAFETY: bounds checked by the debug_asserts above; p * 4 + 4 <= width * 2 + // because pairs = width / 2, so p < pairs means p * 4 + 4 <= width * 2. + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::(q[0]) as i32; - let u = rshift_bits::(q[1]) as i32; - let y1 = rshift_bits::(q[2]) as i32; - let v = rshift_bits::(q[3]) as i32; + let off4 = p * 4 * 2; // byte offset to quadruple p (4 u16 = 8 bytes) + let y0 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4)) }) as i32; + let u = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 2)) }) as i32; + let y1 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 4)) }) as i32; + let v = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 6)) }) as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -96,13 +110,18 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row( /// /// `ALPHA = true` writes a 4-element-per-pixel output with α = /// `(1 << BITS) - 1` (opaque maximum at the native depth). +/// `BE = true` selects big-endian wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -127,12 +146,13 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::(q[0]) as i32; - let u = rshift_bits::(q[1]) as i32; - let y1 = rshift_bits::(q[2]) as i32; - let v = rshift_bits::(q[3]) as i32; + let off4 = p * 4 * 2; + let y0 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4)) }) as i32; + let u = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 2)) }) as i32; + let y1 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 4)) }) as i32; + let v = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 6)) }) as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -158,13 +178,14 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row> (BITS - 8)`. Bypasses the YUV → RGB pipeline entirely. +/// `BE = true` selects big-endian wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_luma_row( +pub(crate) fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -180,10 +201,11 @@ pub(crate) fn y2xx_n_to_luma_row( debug_assert!(luma_out.len() >= width, "luma row too short"); let pairs = width / 2; + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::(q[0]); - let y1 = rshift_bits::(q[2]); + let off4 = p * 4 * 2; + let y0 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4)) }); + let y1 = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 4)) }); luma_out[p * 2] = (y0 >> (BITS - 8)) as u8; luma_out[p * 2 + 1] = (y1 >> (BITS - 8)) as u8; } @@ -191,14 +213,15 @@ pub(crate) fn y2xx_n_to_luma_row( /// Y2xx → native-depth `u16` luma (low-bit-packed). Each output /// `u16` carries the source's BITS-bit Y value in its low BITS bits -/// (upper `(16 - BITS)` bits zero). +/// (upper `(16 - BITS)` bits zero). `BE = true` selects big-endian +/// wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_luma_u16_row( +pub(crate) fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -214,10 +237,11 @@ pub(crate) fn y2xx_n_to_luma_u16_row( debug_assert!(luma_out.len() >= width, "luma row too short"); let pairs = width / 2; + let base = packed.as_ptr().cast::(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - luma_out[p * 2] = rshift_bits::(q[0]); - luma_out[p * 2 + 1] = rshift_bits::(q[2]); + let off4 = p * 4 * 2; + luma_out[p * 2] = rshift_bits::(unsafe { load_endian_u16::(base.add(off4)) }); + luma_out[p * 2 + 1] = rshift_bits::(unsafe { load_endian_u16::(base.add(off4 + 4)) }); } } @@ -227,39 +251,47 @@ pub(crate) fn y2xx_n_to_luma_u16_row( // BITS=12 wrappers (`y212_to_*_row`) without further kernel changes. /// Public Y210 (BITS=10) → packed RGB / RGBA u8 wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_rgb_or_rgba_row( +pub(crate) fn y210_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_or_rgba_row::<10, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y210 → packed `u16` RGB / RGBA wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y210 → 8-bit luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { - y2xx_n_to_luma_row::<10>(packed, luma_out, width); +pub(crate) fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { + y2xx_n_to_luma_row::<10, BE>(packed, luma_out, width); } /// Public Y210 → native-depth `u16` luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { - y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); +pub(crate) fn y210_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { + y2xx_n_to_luma_u16_row::<10, BE>(packed, luma_out, width); } // ---- Public Y212 (BITS=12) wrappers ------------------------------------ @@ -268,39 +300,47 @@ pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: // SIMD code — the per-arch backends already accept BITS ∈ {10, 12}. /// Public Y212 (BITS=12) → packed RGB / RGBA u8 wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_rgb_or_rgba_row( +pub(crate) fn y212_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_or_rgba_row::<12, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y212 → packed `u16` RGB / RGBA wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y212 → 8-bit luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { - y2xx_n_to_luma_row::<12>(packed, luma_out, width); +pub(crate) fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { + y2xx_n_to_luma_row::<12, BE>(packed, luma_out, width); } /// Public Y212 → native-depth `u16` luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { - y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); +pub(crate) fn y212_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { + y2xx_n_to_luma_u16_row::<12, BE>(packed, luma_out, width); } #[cfg(all(test, feature = "std"))] @@ -329,12 +369,17 @@ mod tests { buf } + /// Byte-swap every u16 in a slice to produce the BE-encoded form. + fn to_be_u16(le: &[u16]) -> std::vec::Vec { + le.iter().map(|&v| v.swap_bytes()).collect() + } + #[test] fn scalar_y210_to_rgb_gray_is_gray() { // Full-range gray: Y=512, U=V=512 (10-bit center) → RGB ~128. let buf = solid_y210(8, 512, 512, 512); let mut rgb = [0u8; 8 * 3]; - y210_to_rgb_or_rgba_row::(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -346,7 +391,7 @@ mod tests { fn scalar_y210_to_rgba_alpha_is_opaque() { let buf = solid_y210(8, 512, 512, 512); let mut rgba = [0u8; 8 * 4]; - y210_to_rgb_or_rgba_row::(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert_eq!(px[3], 0xFF); } @@ -357,7 +402,7 @@ mod tests { // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023). let buf = solid_y210(8, 512, 512, 512); let mut rgb = [0u16; 8 * 3]; - y210_to_rgb_u16_or_rgba_u16_row::(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(512) <= 2, "px expected ~512, got {}", px[0]); assert_eq!(px[0], px[1]); @@ -369,7 +414,7 @@ mod tests { fn scalar_y210_to_rgba_u16_alpha_is_max() { let buf = solid_y210(8, 512, 512, 512); let mut rgba = [0u16; 8 * 4]; - y210_to_rgb_u16_or_rgba_u16_row::(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1"); } @@ -388,7 +433,7 @@ mod tests { buf[i * 4 + 3] = 128u16 << 6; // V } let mut luma = [0u8; 6]; - y210_to_luma_row(&buf, &mut luma, 6); + y210_to_luma_row::(&buf, &mut luma, 6); assert_eq!(luma[0], (100u16 >> 2) as u8); assert_eq!(luma[1], (200u16 >> 2) as u8); assert_eq!(luma[2], (300u16 >> 2) as u8); @@ -408,7 +453,7 @@ mod tests { buf[i * 4 + 3] = 128u16 << 6; } let mut luma = [0u16; 6]; - y210_to_luma_u16_row(&buf, &mut luma, 6); + y210_to_luma_u16_row::(&buf, &mut luma, 6); assert_eq!(luma[0], 100); assert_eq!(luma[1], 200); assert_eq!(luma[2], 300); @@ -416,4 +461,64 @@ mod tests { assert_eq!(luma[4], 500); assert_eq!(luma[5], 600); } + + // ---- BE=true parity tests ------------------------------------------- + + /// Verify that byte-swapped Y210 input + BE=true produces the same + /// RGB output as the native LE input + BE=false. + #[test] + fn scalar_y210_be_rgb_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + y210_to_rgb_or_rgba_row::(&le, &mut rgb_le, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::(&be, &mut rgb_be, 8, ColorMatrix::Bt709, true); + assert_eq!( + rgb_le, rgb_be, + "BE and LE paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_rgb_u16_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut out_le = [0u16; 8 * 3]; + let mut out_be = [0u16; 8 * 3]; + y210_to_rgb_u16_or_rgba_u16_row::(&le, &mut out_le, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::(&be, &mut out_be, 8, ColorMatrix::Bt709, true); + assert_eq!( + out_le, out_be, + "BE and LE u16 paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_luma_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + y210_to_luma_row::(&le, &mut luma_le, 8); + y210_to_luma_row::(&be, &mut luma_be, 8); + assert_eq!( + luma_le, luma_be, + "BE and LE luma paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_luma_u16_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut luma_le = [0u16; 8]; + let mut luma_be = [0u16; 8]; + y210_to_luma_u16_row::(&le, &mut luma_le, 8); + y210_to_luma_u16_row::(&be, &mut luma_be, 8); + assert_eq!( + luma_le, luma_be, + "BE and LE luma_u16 paths must produce identical output" + ); + } } diff --git a/src/sinker/mixed/v210.rs b/src/sinker/mixed/v210.rs index 42da55d3..e59a624a 100644 --- a/src/sinker/mixed/v210.rs +++ b/src/sinker/mixed/v210.rs @@ -212,6 +212,7 @@ impl PixelSink for MixedSinker<'_, V210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y values at native depth. @@ -221,6 +222,7 @@ impl PixelSink for MixedSinker<'_, V210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -241,6 +243,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -261,6 +264,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -291,6 +295,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -307,7 +312,15 @@ impl PixelSink for MixedSinker<'_, V210> { w, h, )?; - v210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + v210_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y210.rs b/src/sinker/mixed/y210.rs index cf9caaa5..430b2955 100644 --- a/src/sinker/mixed/y210.rs +++ b/src/sinker/mixed/y210.rs @@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y values at native depth. @@ -222,6 +223,7 @@ impl PixelSink for MixedSinker<'_, Y210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -242,6 +244,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -262,6 +265,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -292,6 +296,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -308,7 +313,15 @@ impl PixelSink for MixedSinker<'_, Y210> { w, h, )?; - y210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y210_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y212.rs b/src/sinker/mixed/y212.rs index e7c1c959..1582e61e 100644 --- a/src/sinker/mixed/y212.rs +++ b/src/sinker/mixed/y212.rs @@ -211,6 +211,7 @@ impl PixelSink for MixedSinker<'_, Y212> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 12-bit Y values at native depth. @@ -220,6 +221,7 @@ impl PixelSink for MixedSinker<'_, Y212> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -240,6 +242,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -260,6 +263,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -290,6 +294,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -306,7 +311,15 @@ impl PixelSink for MixedSinker<'_, Y212> { w, h, )?; - y212_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y212_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y216.rs b/src/sinker/mixed/y216.rs index a8ce416d..4fdbb951 100644 --- a/src/sinker/mixed/y216.rs +++ b/src/sinker/mixed/y216.rs @@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y216> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 16-bit Y values at native depth (direct @@ -223,6 +224,7 @@ impl PixelSink for MixedSinker<'_, Y216> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -243,6 +245,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -263,6 +266,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -293,6 +297,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -309,7 +314,15 @@ impl PixelSink for MixedSinker<'_, Y216> { w, h, )?; - y216_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y216_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row(