From 8022fb7de9ea4921705101a2bb0946b682ec3b2b Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Fri, 8 May 2026 00:12:05 +1200
Subject: [PATCH 1/3] feat(be-tier4): BE support for Y210/Y212/Y216/V210 row kernels

Adds `BE` const-generic gating to all four Tier 4 packed YUV 4:2:2
formats across scalar, NEON, x86 (SSE4.1/AVX2/AVX512), and WASM SIMD128
backends, plus dispatch functions and sinker call sites.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/row/arch/neon/tests/v210.rs         |  36 +-
 src/row/arch/neon/tests/y216.rs         |  28 +-
 src/row/arch/neon/tests/y2xx.rs         |  50 +-
 src/row/arch/neon/v210.rs               |  90 ++--
 src/row/arch/neon/y216.rs               | 630 +++++++++++-----------
 src/row/arch/neon/y2xx.rs               | 346 ++++++------
 src/row/arch/wasm_simd128/tests/v210.rs |  36 +-
 src/row/arch/wasm_simd128/tests/y216.rs |  36 +-
 src/row/arch/wasm_simd128/tests/y2xx.rs |  50 +-
 src/row/arch/wasm_simd128/v210.rs       |  44 +-
 src/row/arch/wasm_simd128/y216.rs       | 490 +++++++++--------
 src/row/arch/wasm_simd128/y2xx.rs       | 425 ++++++++-------
 src/row/arch/x86_avx2/tests/v210.rs     |  36 +-
 src/row/arch/x86_avx2/tests/y216.rs     |  28 +-
 src/row/arch/x86_avx2/tests/y2xx.rs     |  50 +-
 src/row/arch/x86_avx2/v210.rs           |  44 +-
 src/row/arch/x86_avx2/y216.rs           | 654 +++++++++++-----------
 src/row/arch/x86_avx2/y2xx.rs           | 547 ++++++++++---------
 src/row/arch/x86_avx512/tests/v210.rs   |  36 +-
 src/row/arch/x86_avx512/tests/y216.rs   |  28 +-
 src/row/arch/x86_avx512/tests/y2xx.rs   |  50 +-
 src/row/arch/x86_avx512/v210.rs         |  44 +-
 src/row/arch/x86_avx512/y216.rs         | 609 +++++++++++----------
 src/row/arch/x86_avx512/y2xx.rs         | 479 +++++++++--------
 src/row/arch/x86_sse41/tests/v210.rs    |  36 +-
 src/row/arch/x86_sse41/tests/y216.rs    |  28 +-
 src/row/arch/x86_sse41/tests/y2xx.rs    |  50 +-
 src/row/arch/x86_sse41/v210.rs          |  44 +-
 src/row/arch/x86_sse41/y216.rs          | 688 ++++++++++++------------
 src/row/arch/x86_sse41/y2xx.rs          | 419 ++++++++-------
 src/row/dispatch/v210.rs                | 279 ++++++++--
 src/row/dispatch/y210.rs                | 275 ++++++++--
 src/row/dispatch/y212.rs                | 275 ++++++++--
 src/row/dispatch/y216.rs                | 282 ++++++++--
 src/row/mod.rs                          |  11 +-
 src/row/scalar/mod.rs                   |  36 ++
 src/row/scalar/v210.rs                  | 193 +++++--
 src/row/scalar/y216.rs                  | 129 ++++-
 src/row/scalar/y2xx.rs                  | 191 +++++--
 src/sinker/mixed/v210.rs                |  15 +-
 src/sinker/mixed/y210.rs                |  15 +-
 src/sinker/mixed/y212.rs                |  15 +-
 src/sinker/mixed/y216.rs                |  15 +-
 43 files changed, 4682 insertions(+), 3180 deletions(-)

diff --git a/src/row/arch/neon/tests/v210.rs b/src/row/arch/neon/tests/v210.rs
index b82bdab4..be979537 100644
--- a/src/row/arch/neon/tests/v210.rs
+++ b/src/row/arch/neon/tests/v210.rs
@@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
     let mut s = std::vec![0u8; width * 3];
     let mut k = std::vec![0u8; width * 3];
-    scalar::v210_to_rgb_or_rgba_row::<false>(&p, &mut s, width, matrix, full_range);
+    scalar::v210_to_rgb_or_rgba_row::<false, false>(&p, &mut s, width, matrix, full_range);
     unsafe {
-        v210_to_rgb_or_rgba_row::<false>(&p, &mut k, width, matrix, full_range);
+        v210_to_rgb_or_rgba_row::<false, false>(&p, &mut k, width, matrix, full_range);
     }
     assert_eq!(
         s, k,
@@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
     let mut s = std::vec![0u8; width * 4];
     let mut k = std::vec![0u8; width * 4];
-    scalar::v210_to_rgb_or_rgba_row::<true>(&p, &mut s, width, matrix, full_range);
+    scalar::v210_to_rgb_or_rgba_row::<true, false>(&p, &mut s, width, matrix, full_range);
     unsafe {
-        v210_to_rgb_or_rgba_row::<true>(&p, &mut k, width, matrix, full_range);
+        v210_to_rgb_or_rgba_row::<true, false>(&p, &mut k, width, matrix, full_range);
     }
     assert_eq!(
         s, k,
@@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
     let mut s = std::vec![0u16; width * 3];
     let mut k = std::vec![0u16; width * 3];
-    scalar::v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut s, width, matrix, full_range);
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut s, width, matrix, full_range);
     unsafe {
-        v210_to_rgb_u16_or_rgba_u16_row::<false>(&p, &mut k, width, matrix, full_range);
+        v210_to_rgb_u16_or_rgba_u16_row::<false, false>(&p, &mut k, width, matrix, full_range);
     }
     assert_eq!(
         s, k,
@@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55);
     let mut s = std::vec![0u16; width * 4];
     let mut k = std::vec![0u16; width * 4];
-    scalar::v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut s, width, matrix, full_range);
+    scalar::v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut s, width, matrix, full_range);
     unsafe {
-        v210_to_rgb_u16_or_rgba_u16_row::<true>(&p, &mut k, width, matrix, full_range);
+        v210_to_rgb_u16_or_rgba_u16_row::<true, false>(&p, &mut k, width, matrix, full_range);
    }
     assert_eq!(
         s, k,
@@ -82,9 +82,9 @@ fn check_luma(width: usize) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
     let mut s = std::vec![0u8; width];
     let mut k = std::vec![0u8; width];
-    scalar::v210_to_luma_row(&p, &mut s, width);
+    scalar::v210_to_luma_row::<false>(&p, &mut s, width);
     unsafe {
-        v210_to_luma_row(&p, &mut k, width);
+        v210_to_luma_row::<false>(&p, &mut k, width);
     }
     assert_eq!(s, k, "NEON v210→luma diverges (width={width})");
 }
@@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) {
     let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001);
     let mut s = std::vec![0u16; width];
     let mut k = std::vec![0u16; width];
-    scalar::v210_to_luma_u16_row(&p, &mut s, width);
+    scalar::v210_to_luma_u16_row::<false>(&p, &mut s, width);
     unsafe {
-        v210_to_luma_u16_row(&p, &mut k, width);
+        v210_to_luma_u16_row::<false>(&p, &mut k, width);
     }
     assert_eq!(s, k, "NEON v210→luma u16 diverges (width={width})");
 }
@@ -213,7 +213,7 @@ fn neon_v210_lane_order_per_pixel_y_and_u() {
     // Part 1: Luma natural-order (u16, no shift loss)
     let mut luma = std::vec![0u16; W];
     unsafe {
-        v210_to_luma_u16_row(&packed, &mut luma, W);
+        v210_to_luma_u16_row::<false>(&packed, &mut luma, W);
     }
     let expected_luma: std::vec::Vec<u16> = (1..=W as u16).collect();
     assert_eq!(luma, expected_luma, "neon v210 luma reorder bug");
@@ -222,9 +222,15 @@ fn neon_v210_lane_order_per_pixel_y_and_u() {
     let mut simd_rgb = std::vec![0u8; W * 3];
     let mut scalar_rgb = std::vec![0u8; W * 3];
     unsafe {
-        v210_to_rgb_or_rgba_row::<false>(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false);
+        v210_to_rgb_or_rgba_row::<false, false>(
+            &packed,
+            &mut simd_rgb,
+            W,
+            crate::ColorMatrix::Bt709,
+            false,
+        );
     }
-    scalar::v210_to_rgb_or_rgba_row::<false>(
+    scalar::v210_to_rgb_or_rgba_row::<false, false>(
         &packed,
         &mut scalar_rgb,
         W,
diff --git a/src/row/arch/neon/tests/y216.rs b/src/row/arch/neon/tests/y216.rs
index 8d379a2d..1a982f4d 100644
--- a/src/row/arch/neon/tests/y216.rs
+++ b/src/row/arch/neon/tests/y216.rs
@@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b
     let bpp = if ALPHA { 4 } else { 3 };
     let mut s = std::vec![0u8; width * bpp];
     let mut k = std::vec![0u8; width * bpp];
-    scalar::y216_to_rgb_or_rgba_row::<ALPHA>(&p, &mut s, width, matrix, full_range);
+    scalar::y216_to_rgb_or_rgba_row::<ALPHA, false>(&p, &mut s,
width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -32,9 +32,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -48,9 +48,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y216→luma diverges (width={width})"); } @@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y216→luma u16 diverges (width={width})"); } @@ -142,7 +142,7 @@ fn neon_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "NEON y216 luma_u16 reorder bug"); @@ -151,9 +151,15 @@ fn neon_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/neon/tests/y2xx.rs b/src/row/arch/neon/tests/y2xx.rs index 892e0d14..d12a51d4 100644 --- a/src/row/arch/neon/tests/y2xx.rs +++ b/src/row/arch/neon/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). 
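Aside on the test strategy in these lane-order tests: the kernels are driven with Y = 1..=W so that any lane shuffle in a SIMD unpack shows up as a permuted ramp in the luma output. A minimal sketch of building such a row for the Y210-style layout (a hypothetical helper, not the crate's; the [Y0, U0, Y1, V0] quad order and MSB alignment follow the comments in these files):

    // Illustrative only — builds a Y ramp with mid-scale chroma.
    fn ramp_y210(width: usize) -> Vec<u16> {
        const BITS: u32 = 10;
        let mut packed = vec![0u16; width * 2]; // [Y0, U0, Y1, V0, ...]
        for x in 0..width {
            packed[x * 2] = (((x as u16) + 1) & 0x3FF) << (16 - BITS); // Y, MSB-aligned
            packed[x * 2 + 1] = 512 << (16 - BITS); // neutral U/V
        }
        packed
    }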
let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -95,9 +95,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -109,9 +109,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -123,9 +123,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -137,9 +139,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -151,9 +155,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y2xx<{BITS}>→luma diverges (width={width})"); } @@ -162,9 +166,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, 
width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON y2xx<{BITS}>→luma u16 diverges (width={width})"); } @@ -225,15 +229,15 @@ fn neon_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "NEON y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -241,7 +245,7 @@ fn neon_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -253,17 +257,17 @@ fn neon_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "NEON y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "NEON y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/neon/v210.rs b/src/row/arch/neon/v210.rs index 0d9f9748..ba406d7a 100644 --- a/src/row/arch/neon/v210.rs +++ b/src/row/arch/neon/v210.rs @@ -18,34 +18,9 @@ use core::arch::aarch64::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; -/// Loads 16 bytes as 4 × `u32` in **little-endian** order regardless -/// of host endianness. v210 words are documented LE; on big-endian -/// aarch64 (rare — `aarch64_be-*` custom targets) the plain -/// `vld1q_u32` would put bytes in reversed positions within each -/// lane and corrupt every subsequent shift-and-mask. Mirrors the -/// `x2_load_le_u32x4` helper in `packed_rgb.rs` (X2RGB10 / X2BGR10 -/// share the same LE-word constraint). Defining a local helper -/// avoids cross-file visibility hassle since `x2_load_le_u32x4` is -/// `pub(super) fn` but not re-exported via the mod's glob. -/// -/// # Safety -/// -/// Caller must ensure `ptr` has at least 16 bytes readable. -#[inline(always)] -unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t { - unsafe { - let raw = vld1q_u32(ptr as *const u32); - if cfg!(target_endian = "big") { - vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(raw))) - } else { - raw - } - } -} - /// Unpacks one 16-byte v210 word into three u16x8 vectors holding /// 10-bit samples in their low bits: /// - `y_vec`: lanes 0..6 = Y0..Y5 (lanes 6, 7 are don't-care). 
@@ -65,10 +40,12 @@ unsafe fn v210_load_le_u32x4(ptr: *const u8) -> uint32x4_t { /// Caller must ensure `ptr` has at least 16 bytes readable. #[inline] #[target_feature(enable = "neon")] -unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint16x8_t) { +unsafe fn unpack_v210_word_neon( + ptr: *const u8, +) -> (uint16x8_t, uint16x8_t, uint16x8_t) { // SAFETY: caller obligation — `ptr` has 16 bytes readable. unsafe { - let words = v210_load_le_u32x4(ptr); + let words = load_endian_u32x4::(ptr); let mask10 = vdupq_n_u32(0x3FF); let low10 = vandq_u32(words, mask10); let mid10 = vandq_u32(vshrq_n_u32::<10>(words), mask10); @@ -132,12 +109,12 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint } } -/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA`: -/// `false` writes 3 bytes per pixel, `true` writes 4 bytes per -/// pixel with `α = 0xFF`. Output bit depth is u8 (downshifted from +/// NEON v210 → packed RGB / RGBA (u8). Const-generic on `ALPHA` and `BE`. +/// `BE = true` selects big-endian u32 word decoding (each 32-bit packed +/// word stored BE on the wire). Output bit depth is u8 (downshifted from /// the native 10-bit Q15 pipeline via `range_params_n::<10, 8>`). /// -/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::` for +/// Byte-identical to `scalar::v210_to_rgb_or_rgba_row::` for /// every input. /// /// # Safety @@ -148,7 +125,7 @@ unsafe fn unpack_v210_word_neon(ptr: *const u8) -> (uint16x8_t, uint16x8_t, uint /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -185,7 +162,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = vdupq_n_s32(coeffs.b_v()); for w in 0..full_words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); let y_i16 = vreinterpretq_s16_u16(y_vec); @@ -255,14 +232,21 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } /// NEON v210 → packed `u16` RGB / RGBA at native 10-bit depth -/// (low-bit-packed). Byte-identical to -/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::`. +/// (low-bit-packed). `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to +/// `scalar::v210_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -272,7 +256,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). 
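For reference, the bit layout this unpack mirrors: each little-endian u32 carries three 10-bit samples in bits 0..9, 10..19 and 20..29 (bits 30..31 unused), so a 16-byte group of four words decodes to 6 pixels. A scalar sketch of one word (illustrative, not the crate's code; the sample naming assumes the usual v210 sequence Cb0 Y0 Cr0 / Y1 Cb1 Y2 / Cr1 Y3 Cb2 / Y4 Cr2 Y5):

    fn unpack_v210_word(word: u32) -> (u16, u16, u16) {
        let s0 = (word & 0x3FF) as u16;         // bits 0..9
        let s1 = ((word >> 10) & 0x3FF) as u16; // bits 10..19
        let s2 = ((word >> 20) & 0x3FF) as u16; // bits 20..29
        (s0, s1, s2)
    }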
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -309,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = vdupq_n_s32(coeffs.b_v()); for w in 0..full_words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); let y_i16 = vreinterpretq_s16_u16(y_vec); let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); @@ -362,7 +346,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -375,7 +359,8 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// NEON v210 → 8-bit luma. Y values are downshifted from 10-bit to /// 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely. -/// Byte-identical to `scalar::v210_to_luma_row`. +/// `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to `scalar::v210_to_luma_row::`. /// /// # Safety /// @@ -385,7 +370,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let full_words = width / 6; @@ -395,7 +384,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..full_words { - let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8. let y_u8 = vqmovn_u16(vshrq_n_u16::<2>(y_vec)); // Store 6 of the 8 lanes: stack buffer + copy_from_slice. @@ -408,14 +397,15 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } /// NEON v210 → native-depth `u16` luma (low-bit-packed). Each output /// `u16` carries the source's 10-bit Y value in its low 10 bits. -/// Byte-identical to `scalar::v210_to_luma_u16_row`. +/// `BE = true` selects big-endian u32 word decoding. +/// Byte-identical to `scalar::v210_to_luma_u16_row::`. /// /// # Safety /// @@ -425,7 +415,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. 
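The only BE-specific work in these v210 kernels happens at load time: once each u32 is assembled in the right byte order, the bit-slicing is identical. A scalar picture of what `load_endian_u32x4::<BE>` does per word (a sketch, assuming the helper's contract matches its call sites here):

    fn read_v210_word<const BE: bool>(bytes: &[u8]) -> u32 {
        let b: [u8; 4] = bytes[..4].try_into().unwrap();
        if BE { u32::from_be_bytes(b) } else { u32::from_le_bytes(b) }
    }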
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let full_words = width / 6; @@ -435,7 +429,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..full_words { - let (y_vec, _, _) = unpack_v210_word_neon(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_neon::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; vst1q_u16(tmp.as_mut_ptr(), y_vec); @@ -446,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[full_words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/neon/y216.rs b/src/row/arch/neon/y216.rs index 8aaa8664..01a26e62 100644 --- a/src/row/arch/neon/y216.rs +++ b/src/row/arch/neon/y216.rs @@ -32,8 +32,9 @@ use crate::{ColorMatrix, row::scalar}; // ---- u8 output (i32 chroma, 16 px/iter) --------------------------------- /// NEON Y216 → packed u8 RGB or RGBA. +/// `BE = true` bypasses NEON and uses scalar for the full row. /// -/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::y216_to_rgb_or_rgba_row::`. /// /// # Safety /// @@ -43,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -61,128 +62,137 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = vdupq_n_s32(RND); - // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off. - // Y values are full u16 (0..65535), so we must use u16-aware widening - // rather than reinterpreting as i16 (which would corrupt values > 32767). - let y_off_v = vdupq_n_s32(y_off); - let y_scale_v = vdupq_n_s32(y_scale); - let c_scale_v = vdupq_n_s32(c_scale); - let bias_v = vdupq_n_s16(bias as i16); - let cru = vdupq_n_s32(coeffs.r_u()); - let crv = vdupq_n_s32(coeffs.r_v()); - let cgu = vdupq_n_s32(coeffs.g_u()); - let cgv = vdupq_n_s32(coeffs.g_v()); - let cbu = vdupq_n_s32(coeffs.b_u()); - let cbv = vdupq_n_s32(coeffs.b_v()); - + // BE=true: bypass NEON; scalar handles full row below. let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16). - // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - - // Extract U and V from interleaved chroma via vuzp. - // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3] - // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid. - // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid. 
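For reference, the memory layout the `vld2q_u16`/`vuzp` pair below is unpicking: Y216 interleaves 16-bit samples as repeating [Y0, U0, Y1, V0] quads, one chroma pair per two pixels. A scalar sketch of the same deinterleave (illustrative only):

    fn deinterleave_y216(packed: &[u16]) -> (Vec<u16>, Vec<u16>, Vec<u16>) {
        let (mut y, mut u, mut v) = (Vec::new(), Vec::new(), Vec::new());
        for quad in packed.chunks_exact(4) {
            y.extend_from_slice(&[quad[0], quad[2]]); // Y0, Y1
            u.push(quad[1]); // U, shared by the pixel pair
            v.push(quad[3]); // V, shared by the pixel pair
        }
        (y, u, v)
    }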
- let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1); - let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1); - let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1); - let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1); - - // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so - // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16. - let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v); - let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v); - let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v); - let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v); - - // Widen to i32x4 for Q15 multiply. - // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs - // discarded by vzip1q_s16 below which only uses lanes 0..3). - let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16)); - let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16)); - let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16)); - let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16)); - let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16)); - let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16)); - let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16)); - let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16)); - - // Q15 chroma scale. - let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v)); - let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v)); - let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v)); - let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v)); - let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v)); - let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v)); - let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v)); - let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v)); - - // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi; - // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care). - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); - - // Duplicate chroma into Y-pair slots (4:2:2): - // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3] - let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo); - let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo); - let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo); - let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi); - let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi); - let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi); - - // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies - // (y - y_off) * y_scale Q15, narrows to i16x8. Avoids the i16 - // overflow that `scale_y` would cause for Y values > 32767. - let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v); - - // Saturating add; narrow to u8x8. 
- let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo)); - let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo)); - let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo)); - let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi)); - let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi)); - let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi)); - - if ALPHA { - let alpha = vdup_n_u8(0xFF); - vst4_u8( - out.as_mut_ptr().add(x * 4), - uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha), - ); - vst4_u8( - out.as_mut_ptr().add(x * 4 + 32), - uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha), - ); - } else { - vst3_u8( - out.as_mut_ptr().add(x * 3), - uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8), - ); - vst3_u8( - out.as_mut_ptr().add(x * 3 + 24), - uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8), - ); + if !BE { + let rnd_v = vdupq_n_s32(RND); + // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off. + // Y values are full u16 (0..65535), so we must use u16-aware widening + // rather than reinterpreting as i16 (which would corrupt values > 32767). + let y_off_v = vdupq_n_s32(y_off); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 16 <= width { + // Two vld2q_u16 calls: each deinterleaves 8 px (16 u16). + // ptr offset x*2 u16 for lo-group, x*2+16 u16 for hi-group. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + + // Extract U and V from interleaved chroma via vuzp. + // pair_lo.1 = [U0,V0,U1,V1,U2,V2,U3,V3] + // vuzp1q_u16(c,c) = [U0,U1,U2,U3, U0,U1,U2,U3] — low 4 valid. + // vuzp2q_u16(c,c) = [V0,V1,V2,V3, V0,V1,V2,V3] — low 4 valid. + let u_lo_vec = vuzp1q_u16(pair_lo.1, pair_lo.1); + let v_lo_vec = vuzp2q_u16(pair_lo.1, pair_lo.1); + let u_hi_vec = vuzp1q_u16(pair_hi.1, pair_hi.1); + let v_hi_vec = vuzp2q_u16(pair_hi.1, pair_hi.1); + + // Chroma bias subtraction: chroma ∈ [0,65535], bias=32768, so + // (chroma - bias) ∈ [-32768, 32767] which fits exactly in i16. + let u_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(u_lo_vec), bias_v); + let v_lo_i16 = vsubq_s16(vreinterpretq_s16_u16(v_lo_vec), bias_v); + let u_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(u_hi_vec), bias_v); + let v_hi_i16 = vsubq_s16(vreinterpretq_s16_u16(v_hi_vec), bias_v); + + // Widen to i32x4 for Q15 multiply. + // _0 = low 4 (valid), _1 = high 4 (duplicates; don't-care outputs + // discarded by vzip1q_s16 below which only uses lanes 0..3). + let u_lo_i32_0 = vmovl_s16(vget_low_s16(u_lo_i16)); + let u_lo_i32_1 = vmovl_s16(vget_high_s16(u_lo_i16)); + let v_lo_i32_0 = vmovl_s16(vget_low_s16(v_lo_i16)); + let v_lo_i32_1 = vmovl_s16(vget_high_s16(v_lo_i16)); + let u_hi_i32_0 = vmovl_s16(vget_low_s16(u_hi_i16)); + let u_hi_i32_1 = vmovl_s16(vget_high_s16(u_hi_i16)); + let v_hi_i32_0 = vmovl_s16(vget_low_s16(v_hi_i16)); + let v_hi_i32_1 = vmovl_s16(vget_high_s16(v_hi_i16)); + + // Q15 chroma scale. 
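Worked example of the recurring Q15 pattern here: `RND = 1 << 14` is exactly 0.5 in Q15, so `(x * scale + RND) >> 15` is a round-to-nearest fixed-point multiply (the scale value below is illustrative):

    const SCALE: i32 = 40960;               // 1.25 in Q15, i.e. 1.25 * 32768
    const RND: i32 = 1 << 14;               // 0.5 in Q15
    const Y: i32 = (3 * SCALE + RND) >> 15; // 4 == round(3 * 1.25)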
+ let u_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_0, c_scale_v), rnd_v)); + let u_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32_1, c_scale_v), rnd_v)); + let v_d_lo_0 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_0, c_scale_v), rnd_v)); + let v_d_lo_1 = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32_1, c_scale_v), rnd_v)); + let u_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_0, c_scale_v), rnd_v)); + let u_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32_1, c_scale_v), rnd_v)); + let v_d_hi_0 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_0, c_scale_v), rnd_v)); + let v_d_hi_1 = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32_1, c_scale_v), rnd_v)); + + // Build 8-lane chroma vectors (4 valid in lo + 4 duplicate in hi; + // `chroma_i16x8` produces lanes 0..3 correct, lanes 4..7 don't-care). + let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_0, v_d_lo_0, u_d_lo_1, v_d_lo_1, rnd_v); + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_0, v_d_hi_0, u_d_hi_1, v_d_hi_1, rnd_v); + + // Duplicate chroma into Y-pair slots (4:2:2): + // vzip1q_s16([c0,c1,c2,c3, …dup…], same) = [c0,c0,c1,c1,c2,c2,c3,c3] + let r_dup_lo = vzip1q_s16(r_chroma_lo, r_chroma_lo); + let g_dup_lo = vzip1q_s16(g_chroma_lo, g_chroma_lo); + let b_dup_lo = vzip1q_s16(b_chroma_lo, b_chroma_lo); + let r_dup_hi = vzip1q_s16(r_chroma_hi, r_chroma_hi); + let g_dup_hi = vzip1q_s16(g_chroma_hi, g_chroma_hi); + let b_dup_hi = vzip1q_s16(b_chroma_hi, b_chroma_hi); + + // Y scale using u16-aware helper: unsigned-widens u16 → i32, applies + // (y - y_off) * y_scale Q15, narrows to i16x8. Avoids the i16 + // overflow that `scale_y` would cause for Y values > 32767. + let y_lo_scaled = scale_y_u16_to_i16(pair_lo.0, y_off_v, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_u16_to_i16(pair_hi.0, y_off_v, y_scale_v, rnd_v); + + // Saturating add; narrow to u8x8. + let r_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, r_dup_lo)); + let g_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, g_dup_lo)); + let b_lo_u8 = vqmovun_s16(vqaddq_s16(y_lo_scaled, b_dup_lo)); + let r_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, r_dup_hi)); + let g_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, g_dup_hi)); + let b_hi_u8 = vqmovun_s16(vqaddq_s16(y_hi_scaled, b_dup_hi)); + + if ALPHA { + let alpha = vdup_n_u8(0xFF); + vst4_u8( + out.as_mut_ptr().add(x * 4), + uint8x8x4_t(r_lo_u8, g_lo_u8, b_lo_u8, alpha), + ); + vst4_u8( + out.as_mut_ptr().add(x * 4 + 32), + uint8x8x4_t(r_hi_u8, g_hi_u8, b_hi_u8, alpha), + ); + } else { + vst3_u8( + out.as_mut_ptr().add(x * 3), + uint8x8x3_t(r_lo_u8, g_lo_u8, b_lo_u8), + ); + vst3_u8( + out.as_mut_ptr().add(x * 3 + 24), + uint8x8x3_t(r_hi_u8, g_hi_u8, b_hi_u8), + ); + } + + x += 16; } + } // end if !BE - x += 16; - } - - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. 
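The gating shape shared by every kernel in this patch: when `BE = true` the SIMD main loop is skipped, `x` stays 0, and the pre-existing scalar-tail call covers the entire row, so no second fallback path is needed. Skeleton (paraphrased from the hunks above, not literal code):

    fn row_kernel<const BE: bool>(width: usize) {
        let mut x = 0usize;
        if !BE {
            while x + 16 <= width {
                // native-endian SIMD body, 16 px per iteration
                x += 16;
            }
        }
        if x < width {
            // scalar tail: < 16 px normally, the full row when BE = true
        }
    }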
if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -192,7 +202,8 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// NEON Y216 → packed native-depth u16 RGB or RGBA. /// /// Uses i64 chroma (`chroma_i64x4`) to avoid overflow at 16-bit scales. -/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::`. +/// `BE = true` bypasses NEON and uses scalar for the full row. +/// Byte-identical to `scalar::y216_to_rgb_u16_or_rgba_u16_row::`. /// /// ## Pipeline /// @@ -211,7 +222,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -229,180 +240,183 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND: i32 = 1 << 14; unsafe { - let alpha_u16 = vdupq_n_u16(0xFFFF); - let rnd_v = vdupq_n_s32(RND); - let rnd64 = vdupq_n_s64(RND as i64); - let y_off_v = vdupq_n_s32(y_off); - let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32 - let c_scale_v = vdupq_n_s32(c_scale); - let bias_v = vdupq_n_s32(bias); - let cru = vdupq_n_s32(coeffs.r_u()); - let crv = vdupq_n_s32(coeffs.r_v()); - let cgu = vdupq_n_s32(coeffs.g_u()); - let cgv = vdupq_n_s32(coeffs.g_v()); - let cbu = vdupq_n_s32(coeffs.b_u()); - let cbv = vdupq_n_s32(coeffs.b_v()); - + // BE=true: bypass NEON; scalar handles full row below. let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - - // Extract U/V from chroma via vuzp. - // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid. - let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1); - let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1); - let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1); - let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1); - - // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d. - let u_d_lo = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))), - bias_v, + if !BE { + let alpha_u16 = vdupq_n_u16(0xFFFF); + let rnd_v = vdupq_n_s32(RND); + let rnd64 = vdupq_n_s64(RND as i64); + let y_off_v = vdupq_n_s32(y_off); + let y_scale_d = vdup_n_s32(y_scale); // int32x2_t for vmull_s32 + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s32(bias); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 16 <= width { + // Two vld2q_u16: each deinterleaves 8 px → 8 Y + [UV…] pairs. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + + // Extract U/V from chroma via vuzp. + // vuzp1q_u16(c,c) = [U0..U3, U0..U3]; use vget_low for 4 valid. 
+ let u_lo_raw = vuzp1q_u16(pair_lo.1, pair_lo.1); + let v_lo_raw = vuzp2q_u16(pair_lo.1, pair_lo.1); + let u_hi_raw = vuzp1q_u16(pair_hi.1, pair_hi.1); + let v_hi_raw = vuzp2q_u16(pair_hi.1, pair_hi.1); + + // Widen 4 valid chroma samples, subtract bias, apply c_scale → u_d. + let u_d_lo = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_lo_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let v_d_lo = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))), - bias_v, + rnd_v, + )); + let v_d_lo = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_lo_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let u_d_hi = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))), - bias_v, + rnd_v, + )); + let u_d_hi = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_hi_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - let v_d_hi = q15_shift(vaddq_s32( - vmulq_s32( - vsubq_s32( - vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))), - bias_v, + rnd_v, + )); + let v_d_hi = q15_shift(vaddq_s32( + vmulq_s32( + vsubq_s32( + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(v_hi_raw))), + bias_v, + ), + c_scale_v, ), - c_scale_v, - ), - rnd_v, - )); - - // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow). - let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64); - let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64); - let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64); - let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64); - let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64); - let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64); - - // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). - // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3 - // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7 - let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo); - let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo); - let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo); - let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo); - let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo); - let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo); - let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi); - let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi); - let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi); - let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi); - let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi); - let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi); - - // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range. - // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64. - // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15. - let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0))); - let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0))); - let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0))); - let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0))); - let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64); - let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64); - let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64); - let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64); - - // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]). 
- // - // Alignment: - // ys_lo_0 = [Y0,Y1,Y2,Y3] r_cd_lo0 = [c0,c0,c1,c1] → pixels 0..3 - // ys_lo_1 = [Y4,Y5,Y6,Y7] r_cd_lo1 = [c2,c2,c3,c3] → pixels 4..7 - // ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5] → pixels 8..11 - // ys_hi_1 = [Y12..Y15] r_cd_hi1 = [c6,c6,c7,c7] → pixels 12..15 - // - // vcombine_u16(A, B) packs two u16x4 into one u16x8. - let r_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)), - ); - let g_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)), - ); - let b_lo_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)), - vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)), - ); - // hi group (Y8..Y15) - let r_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)), - ); - let g_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)), - ); - let b_hi_u16 = vcombine_u16( - vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)), - vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), - ); - - // Each u16x8 covers 8 pixels. Two stores per format (lo + hi). - // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes). - // Offset for lo: x*4 u16. Offset for hi: x*4+32 u16. - // For RGB: each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes). - // Offset for lo: x*3 u16. Offset for hi: x*3+24 u16. - if ALPHA { - vst4q_u16( - out.as_mut_ptr().add(x * 4), - uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + rnd_v, + )); + + // i64 chroma: 4 values → i32x4 (vmull_s32 widening to avoid i32 overflow). + let r_ch_lo = chroma_i64x4(cru, crv, u_d_lo, v_d_lo, rnd64); + let g_ch_lo = chroma_i64x4(cgu, cgv, u_d_lo, v_d_lo, rnd64); + let b_ch_lo = chroma_i64x4(cbu, cbv, u_d_lo, v_d_lo, rnd64); + let r_ch_hi = chroma_i64x4(cru, crv, u_d_hi, v_d_hi, rnd64); + let g_ch_hi = chroma_i64x4(cgu, cgv, u_d_hi, v_d_hi, rnd64); + let b_ch_hi = chroma_i64x4(cbu, cbv, u_d_hi, v_d_hi, rnd64); + + // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). + // vzip1q_s32([c0,c1,c2,c3], same) = [c0,c0,c1,c1] → Y0,Y1,Y2,Y3 + // vzip2q_s32([c0,c1,c2,c3], same) = [c2,c2,c3,c3] → Y4,Y5,Y6,Y7 + let r_cd_lo0 = vzip1q_s32(r_ch_lo, r_ch_lo); + let r_cd_lo1 = vzip2q_s32(r_ch_lo, r_ch_lo); + let g_cd_lo0 = vzip1q_s32(g_ch_lo, g_ch_lo); + let g_cd_lo1 = vzip2q_s32(g_ch_lo, g_ch_lo); + let b_cd_lo0 = vzip1q_s32(b_ch_lo, b_ch_lo); + let b_cd_lo1 = vzip2q_s32(b_ch_lo, b_ch_lo); + let r_cd_hi0 = vzip1q_s32(r_ch_hi, r_ch_hi); + let r_cd_hi1 = vzip2q_s32(r_ch_hi, r_ch_hi); + let g_cd_hi0 = vzip1q_s32(g_ch_hi, g_ch_hi); + let g_cd_hi1 = vzip2q_s32(g_ch_hi, g_ch_hi); + let b_cd_hi0 = vzip1q_s32(b_ch_hi, b_ch_hi); + let b_cd_hi1 = vzip2q_s32(b_ch_hi, b_ch_hi); + + // i64 Y scale: (y - y_off) * y_scale can reach ~2.35×10⁹ at limited range. + // Split each 8-lane Y into two i32x4 halves for scale_y_u16_i64. + // y_lo_0 = Y0..Y3, y_lo_1 = Y4..Y7; y_hi_0 = Y8..Y11, y_hi_1 = Y12..Y15. 
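The arithmetic behind the i64 requirement, using nominal limited-range constants (the exact values come from `range_params`; these figures are assumptions for illustration): `y_off = 16 << 8 = 4096` and a Q15 luma scale of roughly 1.164 × 32768 ≈ 38142, so the worst case overflows i32:

    const WORST: i64 = (65535 - 4096) * 38142;       // ≈ 2.34e9
    const FITS_I32: bool = WORST <= i32::MAX as i64; // false — hence vmull_s32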
+ let y_lo_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_lo.0))); + let y_lo_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_lo.0))); + let y_hi_0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(pair_hi.0))); + let y_hi_1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(pair_hi.0))); + let ys_lo_0 = scale_y_u16_i64(y_lo_0, y_off_v, y_scale_d, rnd64); + let ys_lo_1 = scale_y_u16_i64(y_lo_1, y_off_v, y_scale_d, rnd64); + let ys_hi_0 = scale_y_u16_i64(y_hi_0, y_off_v, y_scale_d, rnd64); + let ys_hi_1 = scale_y_u16_i64(y_hi_1, y_off_v, y_scale_d, rnd64); + + // Y + chroma; vqmovun_s32 saturates i32 → u16 (clamps [0, 65535]). + // + // Alignment: + // ys_lo_0 = [Y0,Y1,Y2,Y3] r_cd_lo0 = [c0,c0,c1,c1] → pixels 0..3 + // ys_lo_1 = [Y4,Y5,Y6,Y7] r_cd_lo1 = [c2,c2,c3,c3] → pixels 4..7 + // ys_hi_0 = [Y8,Y9,Y10,Y11] r_cd_hi0 = [c4,c4,c5,c5] → pixels 8..11 + // ys_hi_1 = [Y12..Y15] r_cd_hi1 = [c6,c6,c7,c7] → pixels 12..15 + // + // vcombine_u16(A, B) packs two u16x4 into one u16x8. + let r_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, r_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, r_cd_lo1)), ); - vst4q_u16( - out.as_mut_ptr().add(x * 4 + 32), - uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + let g_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, g_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, g_cd_lo1)), ); - } else { - vst3q_u16( - out.as_mut_ptr().add(x * 3), - uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + let b_lo_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_lo_0, b_cd_lo0)), + vqmovun_s32(vaddq_s32(ys_lo_1, b_cd_lo1)), ); - vst3q_u16( - out.as_mut_ptr().add(x * 3 + 24), - uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + // hi group (Y8..Y15) + let r_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, r_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, r_cd_hi1)), + ); + let g_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, g_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, g_cd_hi1)), + ); + let b_hi_u16 = vcombine_u16( + vqmovun_s32(vaddq_s32(ys_hi_0, b_cd_hi0)), + vqmovun_s32(vaddq_s32(ys_hi_1, b_cd_hi1)), ); - } - x += 16; - } + // Each u16x8 covers 8 pixels. Two stores per format (lo + hi). + // For ALPHA: each vst4q_u16 writes 8 RGBA pixels (8 × 4 × 2 = 64 bytes). + // Offset for lo: x*4 u16. Offset for hi: x*4+32 u16. + // For RGB: each vst3q_u16 writes 8 RGB pixels (8 × 3 × 2 = 48 bytes). + // Offset for lo: x*3 u16. Offset for hi: x*3+24 u16. + if ALPHA { + vst4q_u16( + out.as_mut_ptr().add(x * 4), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + ); + vst4q_u16( + out.as_mut_ptr().add(x * 4 + 32), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + ); + } else { + vst3q_u16( + out.as_mut_ptr().add(x * 3), + uint16x8x3_t(r_lo_u16, g_lo_u16, b_lo_u16), + ); + vst3q_u16( + out.as_mut_ptr().add(x * 3 + 24), + uint16x8x3_t(r_hi_u16, g_hi_u16, b_hi_u16), + ); + } + + x += 16; + } + } // end if !BE - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -416,8 +430,9 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // ---- Luma u8 (16 px/iter) ----------------------------------------------- /// NEON Y216 → u8 luma. Extracts Y via `>> 8`. +/// `BE = true` bypasses NEON and uses scalar. 
/// -/// Byte-identical to `scalar::y216_to_luma_row`. +/// Byte-identical to `scalar::y216_to_luma_row::`. /// /// # Safety /// @@ -427,29 +442,35 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { let mut x = 0usize; - while x + 16 <= width { - // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded. - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - // >> 8 narrows u16 → u8 (high byte of each Y sample). - let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0); - let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0); - vst1_u8(out.as_mut_ptr().add(x), y_lo_u8); - vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8); - x += 16; + if !BE { + while x + 16 <= width { + // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded. + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + // >> 8 narrows u16 → u8 (high byte of each Y sample). + let y_lo_u8 = vshrn_n_u16::<8>(pair_lo.0); + let y_hi_u8 = vshrn_n_u16::<8>(pair_hi.0); + vst1_u8(out.as_mut_ptr().add(x), y_lo_u8); + vst1_u8(out.as_mut_ptr().add(x + 8), y_hi_u8); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -457,8 +478,9 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // ---- Luma u16 (16 px/iter) ---------------------------------------------- /// NEON Y216 → u16 luma. Direct copy of Y samples (no shift). +/// `BE = true` bypasses NEON and uses scalar. /// -/// Byte-identical to `scalar::y216_to_luma_u16_row`. +/// Byte-identical to `scalar::y216_to_luma_u16_row::`. /// /// # Safety /// @@ -468,26 +490,32 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { let mut x = 0usize; - while x + 16 <= width { - let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); - let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); - // Direct copy — Y samples are already full 16-bit (no shift needed). - vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0); - vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0); - x += 16; + if !BE { + while x + 16 <= width { + let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); + let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); + // Direct copy — Y samples are already full 16-bit (no shift needed). 
+ vst1q_u16(out.as_mut_ptr().add(x), pair_lo.0); + vst1q_u16(out.as_mut_ptr().add(x + 8), pair_hi.0); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/neon/y2xx.rs b/src/row/arch/neon/y2xx.rs index 0c02365f..72920362 100644 --- a/src/row/arch/neon/y2xx.rs +++ b/src/row/arch/neon/y2xx.rs @@ -83,11 +83,12 @@ unsafe fn unpack_y2xx_8px_neon( } /// NEON Y2xx → packed RGB / RGBA u8. Const‑generic over -/// `BITS ∈ {10, 12}` and `ALPHA ∈ {false, true}`. Output bit depth is -/// u8 (downshifted from the native BITS Q15 pipeline via -/// `range_params_n::`). +/// `BITS ∈ {10, 12}`, `ALPHA ∈ {false, true}`, and `BE ∈ {false, true}`. +/// `BE = true` selects big-endian u16 decoding for the input samples. +/// When `BE = true` the SIMD path is bypassed and the scalar kernel +/// handles the full row (the NEON loop only handles native-endian data). /// -/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::` +/// Byte‑identical to `scalar::y2xx_n_to_rgb_or_rgba_row::` /// for every input. /// /// # Safety @@ -98,7 +99,11 @@ unsafe fn unpack_y2xx_8px_neon( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -126,86 +131,90 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. 8 valid lanes per channel. - let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup)); - let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup)); - let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup)); - - if ALPHA { - let alpha = vdup_n_u8(0xFF); - vst4_u8( - out.as_mut_ptr().add(x * 4), - uint8x8x4_t(r_u8, g_u8, b_u8, alpha), - ); - } else { - vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8)); + if !BE { + let rnd_v = vdupq_n_s32(RND); + let y_off_v = vdupq_n_s16(y_off as i16); + let y_scale_v = vdupq_n_s32(y_scale); + let c_scale_v = vdupq_n_s32(c_scale); + let bias_v = vdupq_n_s16(bias as i16); + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + let cru = vdupq_n_s32(coeffs.r_u()); + let crv = vdupq_n_s32(coeffs.r_v()); + let cgu = vdupq_n_s32(coeffs.g_u()); + let cgv = vdupq_n_s32(coeffs.g_v()); + let cbu = vdupq_n_s32(coeffs.b_u()); + let cbv = vdupq_n_s32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_neon(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = vreinterpretq_s16_u16(y_vec); + + // Subtract chroma bias (e.g. 512 for 10‑bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = vsubq_s16(vreinterpretq_s16_u16(u_vec), bias_v); + let v_i16 = vsubq_s16(vreinterpretq_s16_u16(v_vec), bias_v); + + // Widen 8‑lane i16 chroma to two i32x4 halves for the Q15 + // multiplies. Only lanes 0..3 of `_lo` are valid; `_hi` is + // entirely don't-care (duplicate of `_lo`). We feed both + // halves through `chroma_i16x8` to recycle the helper exactly; + // the don't-care output lanes are discarded by `vzip1q_s16` + // below (which only consumes lanes 0..3). 
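What the `vzip1q_s16(c, c)` duplication below amounts to in scalar form: each of the four chroma-derived values covers a 4:2:2 pixel pair, so [c0, c1, c2, c3] fans out to align with Y0..Y7:

    fn dup_422(c: [i16; 4]) -> [i16; 8] {
        [c[0], c[0], c[1], c[1], c[2], c[2], c[3], c[3]]
    }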
+ let u_lo_i32 = vmovl_s16(vget_low_s16(u_i16)); + let u_hi_i32 = vmovl_s16(vget_high_s16(u_i16)); + let v_lo_i32 = vmovl_s16(vget_low_s16(v_i16)); + let v_hi_i32 = vmovl_s16(vget_high_s16(v_i16)); + + let u_d_lo = q15_shift(vaddq_s32(vmulq_s32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(vaddq_s32(vmulq_s32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(vaddq_s32(vmulq_s32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(vaddq_s32(vmulq_s32(v_hi_i32, c_scale_v), rnd_v)); + + // 8‑lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `vzip1q_s16` so lanes 0..7 of `r_dup` align with Y0..Y7. + // `vzip1q_s16` interleaves the low 4 lanes of each operand: + // [c0, c0, c1, c1, c2, c2, c3, c3] + let r_dup = vzip1q_s16(r_chroma, r_chroma); + let g_dup = vzip1q_s16(g_chroma, g_chroma); + let b_dup = vzip1q_s16(b_chroma, b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. 8 valid lanes per channel. + let r_u8 = vqmovun_s16(vqaddq_s16(y_scaled, r_dup)); + let g_u8 = vqmovun_s16(vqaddq_s16(y_scaled, g_dup)); + let b_u8 = vqmovun_s16(vqaddq_s16(y_scaled, b_dup)); + + if ALPHA { + let alpha = vdup_n_u8(0xFF); + vst4_u8( + out.as_mut_ptr().add(x * 4), + uint8x8x4_t(r_u8, g_u8, b_u8, alpha), + ); + } else { + vst3_u8(out.as_mut_ptr().add(x * 3), uint8x8x3_t(r_u8, g_u8, b_u8)); + } + + x += 8; } - - x += 8; } - // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // Scalar tail — remaining < 8 pixels (always even per 4:2:2), or + // full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -218,10 +227,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row`. +/// `scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -231,7 +241,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -257,71 +271,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -335,9 +352,9 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row> (BITS - 8)` after the `>> (16 - BITS)` MSB‑alignment, i.e. /// a single `>> 8` from the raw u16 sample. Bypasses the YUV → RGB -/// pipeline entirely. +/// pipeline entirely. `BE = true` bypasses NEON and uses scalar. /// -/// Byte‑identical to `scalar::y2xx_n_to_luma_row::`. +/// Byte‑identical to `scalar::y2xx_n_to_luma_row::`. /// /// # Safety /// @@ -347,7 +364,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. 
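Why the single `>> 8` in the luma kernel below is exact for MSB-aligned samples: with `raw = s << (16 - BITS)`, shifting `raw >> (16 - BITS)` and then `>> (BITS - 8)` is the same as `raw >> 8` for any BITS. Worked at BITS = 10:

    const S: u16 = 0x3FF;        // full-scale 10-bit sample
    const RAW: u16 = S << 6;     // 0xFFC0 — MSB-aligned in u16
    const LUMA8: u16 = RAW >> 8; // 0xFF == S >> 2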
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -365,29 +382,32 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - while x + 8 <= width { - // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples - // (still MSB‑aligned at BITS ≤ 12, low bits zero). - let pair = vld2q_u16(packed.as_ptr().add(x * 2)); - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` - // for any BITS ∈ {10, 12} — the constant fold gives the same - // result whether we shift in two stages or one. - let y_u8 = vshrn_n_u16::<8>(pair.0); - vst1_u8(luma_out.as_mut_ptr().add(x), y_u8); - x += 8; + if !BE { + while x + 8 <= width { + // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples + // (still MSB‑aligned at BITS ≤ 12, low bits zero). + let pair = vld2q_u16(packed.as_ptr().add(x * 2)); + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` + // for any BITS ∈ {10, 12} — the constant fold gives the same + // result whether we shift in two stages or one. + let y_u8 = vshrn_n_u16::<8>(pair.0); + vst1_u8(luma_out.as_mut_ptr().add(x), y_u8); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } /// NEON Y2xx → native‑depth `u16` luma (low‑bit‑packed). Each output /// `u16` carries the source's BITS-bit Y value in its low BITS bits. -/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::`. +/// `BE = true` bypasses NEON and uses scalar. +/// Byte‑identical to `scalar::y2xx_n_to_luma_u16_row::`. /// /// # Safety /// @@ -397,7 +417,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -414,21 +434,23 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); let mut x = 0usize; - while x + 8 <= width { - let pair = vld2q_u16(packed.as_ptr().add(x * 2)); - // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples - // into low‑bit‑packed form for the native‑depth u16 output. - let y_low = vshlq_u16(pair.0, shr_count); - vst1q_u16(luma_out.as_mut_ptr().add(x), y_low); - x += 8; + if !BE { + let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); + while x + 8 <= width { + let pair = vld2q_u16(packed.as_ptr().add(x * 2)); + // Right‑shift by `(16 - BITS)` to bring MSB‑aligned samples + // into low‑bit‑packed form for the native‑depth u16 output. 
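// The vshlq_u16 call below shifts left by a signed per-lane count, so
// the kernels encode "right shift by (16 - BITS)" as a negative
// left-shift count. A scalar stand-in (illustrative name):
fn vshl_u16_model(sample: u16, count: i16) -> u16 {
    if count >= 0 {
        sample << count
    } else {
        sample >> -count
    }
}
// With shr_count = -((16 - BITS) as i16), this is exactly
// raw >> (16 - BITS): the MSB-aligned sample in low-bit-packed form.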
+ let y_low = vshlq_u16(pair.0, shr_count); + vst1q_u16(luma_out.as_mut_ptr().add(x), y_low); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/tests/v210.rs b/src/row/arch/wasm_simd128/tests/v210.rs index ac7455c2..d4d51116 100644 --- a/src/row/arch/wasm_simd128/tests/v210.rs +++ b/src/row/arch/wasm_simd128/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, 
width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v210→luma u16 diverges (width={width})"); } @@ -227,7 +227,7 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma_out = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma_out, W); + v210_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -239,9 +239,15 @@ fn wasm_simd128_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/y216.rs b/src/row/arch/wasm_simd128/tests/y216.rs index 8441d72c..034f029b 100644 --- a/src/row/arch/wasm_simd128/tests/y216.rs +++ b/src/row/arch/wasm_simd128/tests/y216.rs @@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -29,9 +29,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -43,9 +43,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -57,9 +57,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_y216(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -71,9 +71,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - 
scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y216→luma u8 diverges (width={width})"); } @@ -82,9 +82,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y216→luma u16 diverges (width={width})"); } @@ -183,7 +183,7 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -195,9 +195,15 @@ fn wasm_simd128_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/y2xx.rs b/src/row/arch/wasm_simd128/tests/y2xx.rs index 08a484ce..ad31d2f1 100644 --- a/src/row/arch/wasm_simd128/tests/y2xx.rs +++ b/src/row/arch/wasm_simd128/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). 
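// The lane-order tests encode the pixel index into Y (Y_i = i + 1) so
// any lane permutation inside a kernel shows up as a scrambled
// sequence rather than a plausible-looking value. Sketch of such an
// input builder (hypothetical helper; Y sits at the even u16 lanes):
fn natural_order_y216(width: usize) -> std::vec::Vec<u16> {
    let mut packed = std::vec![0u16; width * 2];
    for i in 0..width {
        packed[i * 2] = (i + 1) as u16;
    }
    packed
}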
let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -101,9 +101,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -115,9 +115,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -129,9 +129,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -143,9 +145,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -157,9 +161,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -168,9 +172,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut 
s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -251,15 +255,15 @@ fn wasm_simd128_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "simd128 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -267,7 +271,7 @@ fn wasm_simd128_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -282,17 +286,17 @@ fn wasm_simd128_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "simd128 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "simd128 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/wasm_simd128/v210.rs b/src/row/arch/wasm_simd128/v210.rs index dba59ca9..264ca1c4 100644 --- a/src/row/arch/wasm_simd128/v210.rs +++ b/src/row/arch/wasm_simd128/v210.rs @@ -16,7 +16,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks one 16-byte v210 word into three `v128` vectors holding @@ -45,11 +45,11 @@ use crate::{ColorMatrix, row::scalar}; /// wasm). #[inline] #[target_feature(enable = "simd128")] -unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { +unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { // SAFETY: caller obligation — `ptr` has 16 bytes readable; simd128 // is enabled at compile time. unsafe { - let words = v128_load(ptr.cast()); + let words = load_endian_u32x4::(ptr); let mask10 = i32x4_splat(0x3FF); let low10 = v128_and(words, mask10); let mid10 = v128_and(u32x4_shr(words, 10), mask10); @@ -146,7 +146,7 @@ unsafe fn unpack_v210_word_wasm(ptr: *const u8) -> (v128, v128, v128) { /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
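// Scalar model of the 10-bit field extraction each 32-bit v210 word
// undergoes above (mask 0x3FF, shifts of 10 and 20); four such words
// form one 16-byte group carrying 12 samples, i.e. 6 pixels at 4:2:2
// (illustrative function, not a kernel API):
fn unpack_v210_word(word: u32) -> (u16, u16, u16) {
    let low = (word & 0x3FF) as u16;
    let mid = ((word >> 10) & 0x3FF) as u16;
    let high = ((word >> 20) & 0x3FF) as u16;
    (low, mid, high)
}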
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -183,7 +183,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = i32x4_splat(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; @@ -270,7 +270,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -287,7 +293,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -324,7 +330,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = i32x4_splat(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; let u_i16 = i16x8_sub(u_vec, bias_v); @@ -391,7 +397,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -414,7 +420,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -424,7 +434,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x16 via // saturating narrow (Y ≤ 1023 stays well inside [0, 255] post-shift). let y_shr = u16x8_shr(y_vec, 2); @@ -439,7 +449,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -456,7 +466,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. 
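// The >> 2 downshift above is all the 10-bit to 8-bit luma conversion
// needs: 1023 >> 2 == 255, so the saturating narrow can never clamp.
// Scalar stand-in:
fn y10_to_u8(y10: u16) -> u8 {
    debug_assert!(y10 <= 1023);
    (y10 >> 2) as u8
}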
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -466,7 +480,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_wasm(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_wasm::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; v128_store(tmp.as_mut_ptr().cast(), y_vec); @@ -477,7 +491,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/y216.rs b/src/row/arch/wasm_simd128/y216.rs index 5beb78f2..7bdf6363 100644 --- a/src/row/arch/wasm_simd128/y216.rs +++ b/src/row/arch/wasm_simd128/y216.rs @@ -107,7 +107,7 @@ unsafe fn unpack_y216_8px_wasm(ptr: *const u16) -> (v128, v128, v128) { /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -124,102 +124,111 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = i32x4_splat(RND); - let y_off32_v = i32x4_splat(y_off); - let y_scale_v = i32x4_splat(y_scale); - let c_scale_v = i32x4_splat(c_scale); - // Bias = 32768 = 0x8000; as i16 this wraps to -32768. - // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly - // maps full-u16 chroma [0, 65535] to [-32768, 32767]. - let bias16_v = i16x8_splat(-32768i16); - let alpha_u8 = u8x16_splat(0xFF); - let cru = i32x4_splat(coeffs.r_u()); - let crv = i32x4_splat(coeffs.r_v()); - let cgu = i32x4_splat(coeffs.g_u()); - let cgv = i32x4_splat(coeffs.g_v()); - let cbu = i32x4_splat(coeffs.b_u()); - let cbv = i32x4_splat(coeffs.b_v()); - let mut x = 0usize; - // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15). - while x + 16 <= width { - let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16)); - - // Chroma bias subtraction (wrapping trick for full-u16 range). - let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); - let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v); - let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); - let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v); - - // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid. - // Hi halves hold zeros (from the swizzle mask) — don't-care since - // `chroma_i16x8` discards lanes 4..7 after `dup_lo`. 
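// Pattern behind the "6 of 8 lanes" store above: spill the whole
// vector to a stack array, then copy only the valid prefix. A scalar
// stand-in for the v128_store + copy_from_slice pair (illustrative
// name):
fn store_valid_prefix(lanes: [u16; 8], out: &mut [u16], valid: usize) {
    out[..valid].copy_from_slice(&lanes[..valid]);
}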
- let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16); - let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16); - let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16); - let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16); - let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16); - let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16); - let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16); - let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16); - - // Q15 chroma scale → i32x4 (scaled chroma deltas). - let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v)); - let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v)); - let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v)); - let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v)); - let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v)); - let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v)); - let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v)); - let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v)); - - // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care). - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); - - // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample). - let r_dup_lo = dup_lo(r_chroma_lo); - let g_dup_lo = dup_lo(g_chroma_lo); - let b_dup_lo = dup_lo(b_chroma_lo); - let r_dup_hi = dup_lo(r_chroma_hi); - let g_dup_hi = dup_lo(g_chroma_hi); - let b_dup_hi = dup_lo(b_chroma_hi); - - // Y scale via unsigned widening (Y216 has full u16 range; i16 would - // overflow for Y > 32767). - let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v); - - // Saturating add → saturating narrow to u8x16. - let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo); - let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi); - let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo); - let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi); - let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo); - let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi); - let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); - let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); - let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); - - if ALPHA { - write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = i32x4_splat(RND); + let y_off32_v = i32x4_splat(y_off); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + // Bias = 32768 = 0x8000; as i16 this wraps to -32768. + // Using the wrapping trick (i16x8_sub with bias16 = -32768) correctly + // maps full-u16 chroma [0, 65535] to [-32768, 32767]. 
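// Scalar check of the wrapping 0x8000 bias trick described above:
// adding 0x8000 mod 2^16 and reinterpreting as i16 equals the
// mathematical c - 32768 for every u16 chroma sample (sketch):
fn center_chroma(c: u16) -> i16 {
    c.wrapping_add(0x8000) as i16
}

#[test]
fn bias_trick_matches_wide_subtraction() {
    for c in [0u16, 1, 32767, 32768, 65535] {
        assert_eq!(i32::from(center_chroma(c)), i32::from(c) - 32768);
    }
}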
+ let bias16_v = i16x8_splat(-32768i16); + let alpha_u8 = u8x16_splat(0xFF); + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + // 16 px/iter: two groups of 8 (lo = Y0..Y7, hi = Y8..Y15). + while x + 16 <= width { + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); + let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2 + 16)); + + // Chroma bias subtraction (wrapping trick for full-u16 range). + let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); + let v_lo_i16 = i16x8_sub(v_lo_vec, bias16_v); + let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); + let v_hi_i16 = i16x8_sub(v_hi_vec, bias16_v); + + // Widen to i32x4 halves; only lo halves (lanes 0..3) are valid. + // Hi halves hold zeros (from the swizzle mask) — don't-care since + // `chroma_i16x8` discards lanes 4..7 after `dup_lo`. + let u_lo_lo = i32x4_extend_low_i16x8(u_lo_i16); + let u_lo_hi = i32x4_extend_high_i16x8(u_lo_i16); + let v_lo_lo = i32x4_extend_low_i16x8(v_lo_i16); + let v_lo_hi = i32x4_extend_high_i16x8(v_lo_i16); + let u_hi_lo = i32x4_extend_low_i16x8(u_hi_i16); + let u_hi_hi = i32x4_extend_high_i16x8(u_hi_i16); + let v_hi_lo = i32x4_extend_low_i16x8(v_hi_i16); + let v_hi_hi = i32x4_extend_high_i16x8(v_hi_i16); + + // Q15 chroma scale → i32x4 (scaled chroma deltas). + let u_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_lo, c_scale_v), rnd_v)); + let u_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(u_lo_hi, c_scale_v), rnd_v)); + let v_d_lo_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_lo, c_scale_v), rnd_v)); + let v_d_lo_hi = q15_shift(i32x4_add(i32x4_mul(v_lo_hi, c_scale_v), rnd_v)); + let u_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(u_hi_lo, c_scale_v), rnd_v)); + let u_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_hi, c_scale_v), rnd_v)); + let v_d_hi_lo = q15_shift(i32x4_add(i32x4_mul(v_hi_lo, c_scale_v), rnd_v)); + let v_d_hi_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_hi, c_scale_v), rnd_v)); + + // 8-lane i16 chroma vectors (valid in lanes 0..3; lanes 4..7 don't-care). + let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo_lo, v_d_lo_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi_lo, v_d_hi_lo, u_d_hi_hi, v_d_hi_hi, rnd_v); + + // Duplicate chroma into Y-pair slots (4:2:2 nearest-neighbor upsample). + let r_dup_lo = dup_lo(r_chroma_lo); + let g_dup_lo = dup_lo(g_chroma_lo); + let b_dup_lo = dup_lo(b_chroma_lo); + let r_dup_hi = dup_lo(r_chroma_hi); + let g_dup_hi = dup_lo(g_chroma_hi); + let b_dup_hi = dup_lo(b_chroma_hi); + + // Y scale via unsigned widening (Y216 has full u16 range; i16 would + // overflow for Y > 32767). + let y_lo_scaled = scale_y_u16_wasm(y_lo_vec, y_off32_v, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_u16_wasm(y_hi_vec, y_off32_v, y_scale_v, rnd_v); + + // Saturating add → saturating narrow to u8x16. 
+ let r_lo = i16x8_add_sat(y_lo_scaled, r_dup_lo); + let r_hi = i16x8_add_sat(y_hi_scaled, r_dup_hi); + let g_lo = i16x8_add_sat(y_lo_scaled, g_dup_lo); + let g_hi = i16x8_add_sat(y_hi_scaled, g_dup_hi); + let b_lo = i16x8_add_sat(y_lo_scaled, b_dup_lo); + let b_hi = i16x8_add_sat(y_hi_scaled, b_dup_hi); + let r_u8 = u8x16_narrow_i16x8(r_lo, r_hi); + let g_u8 = u8x16_narrow_i16x8(g_lo, g_hi); + let b_u8 = u8x16_narrow_i16x8(b_lo, b_hi); + + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + x += 16; } - x += 16; } // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -237,7 +246,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -255,101 +264,104 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND_I32: i32 = 1 << 14; unsafe { - let alpha_u16 = u16x8_splat(0xFFFF); - let rnd_i64 = i64x2_splat(RND_I64); - let rnd_i32 = i32x4_splat(RND_I32); - let y_off32 = i32x4_splat(y_off); - let y_scale_i64 = i64x2_splat(y_scale as i64); - let c_scale_i32 = i32x4_splat(c_scale); - // Wrapping 0x8000 bias trick for full-u16 chroma. - let bias16 = i16x8_splat(-32768i16); - // Coefficients widened once to i64x2. - let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u())); - let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v())); - let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u())); - let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v())); - let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u())); - let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v())); - let mut x = 0usize; - // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs. - while x + 8 <= width { - let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); - - // Chroma bias (wrapping trick). - let u_i16 = i16x8_sub(u_vec, bias16); - let v_i16 = i16x8_sub(v_vec, bias16); - - // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares). - let u_i32 = i32x4_extend_low_i16x8(u_i16); - let v_i32 = i32x4_extend_low_i16x8(v_i16); - - // Q15 scale → 4 × i32 chroma deltas. - let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15); - let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15); - - // Widen to 2 × i64x2 for i64 chroma pipeline. 
- let u_d_lo = i64x2_extend_low_i32x4(u_d); - let u_d_hi = i64x2_extend_high_i32x4(u_d); - let v_d_lo = i64x2_extend_low_i32x4(v_d); - let v_d_hi = i64x2_extend_high_i32x4(v_d); - - let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64); - let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64); - let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64); - let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64); - let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64); - let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64); - - // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3]. - let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi); - let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi); - let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi); - - // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). - // chroma_dup_i32x4_u16([c0,c1,c2,c3]) → - // lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3] - let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32); - let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32); - let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32); - - // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64. - let y_lo_u32 = u32x4_extend_low_u16x8(y_vec); - let y_hi_u32 = u32x4_extend_high_u16x8(y_vec); - let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32); - let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32); - - let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64); - let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64); - - // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]). - let r_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, r_dup_lo), - i32x4_add(y_hi_scaled, r_dup_hi), - ); - let g_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, g_dup_lo), - i32x4_add(y_hi_scaled, g_dup_hi), - ); - let b_u16 = u16x8_narrow_i32x4( - i32x4_add(y_lo_scaled, b_dup_lo), - i32x4_add(y_hi_scaled, b_dup_hi), - ); - - if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = u16x8_splat(0xFFFF); + let rnd_i64 = i64x2_splat(RND_I64); + let rnd_i32 = i32x4_splat(RND_I32); + let y_off32 = i32x4_splat(y_off); + let y_scale_i64 = i64x2_splat(y_scale as i64); + let c_scale_i32 = i32x4_splat(c_scale); + // Wrapping 0x8000 bias trick for full-u16 chroma. + let bias16 = i16x8_splat(-32768i16); + // Coefficients widened once to i64x2. + let cru = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_u())); + let crv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.r_v())); + let cgu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_u())); + let cgv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.g_v())); + let cbu = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_u())); + let cbv = i64x2_extend_low_i32x4(i32x4_splat(coeffs.b_v())); + + // 8 px/iter: one call to unpack_y216_8px_wasm gives Y0..Y7 and 4 UV pairs. + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y216_8px_wasm(packed.as_ptr().add(x * 2)); + + // Chroma bias (wrapping trick). + let u_i16 = i16x8_sub(u_vec, bias16); + let v_i16 = i16x8_sub(v_vec, bias16); + + // Widen low 4 lanes to i32x4 (high 4 are zeroed don't-cares). + let u_i32 = i32x4_extend_low_i16x8(u_i16); + let v_i32 = i32x4_extend_low_i16x8(v_i16); + + // Q15 scale → 4 × i32 chroma deltas. 
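// Scalar stand-in for the i64 chroma step on the u16 output path,
// assuming chroma_i64x2_wasm computes the usual rounded Q15 dot
// product: full-range chroma deltas times Q15 coefficients sit close
// to the i32 limit, so the weighted sum is carried in i64 before
// narrowing back.
const RND_I64: i64 = 1 << 14;

fn chroma_term(cu: i64, cv: i64, u_d: i64, v_d: i64) -> i32 {
    ((cu * u_d + cv * v_d + RND_I64) >> 15) as i32
}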
+ let u_d = i32x4_shr(i32x4_add(i32x4_mul(u_i32, c_scale_i32), rnd_i32), 15); + let v_d = i32x4_shr(i32x4_add(i32x4_mul(v_i32, c_scale_i32), rnd_i32), 15); + + // Widen to 2 × i64x2 for i64 chroma pipeline. + let u_d_lo = i64x2_extend_low_i32x4(u_d); + let u_d_hi = i64x2_extend_high_i32x4(u_d); + let v_d_lo = i64x2_extend_low_i32x4(v_d); + let v_d_hi = i64x2_extend_high_i32x4(v_d); + + let r_ch_lo = chroma_i64x2_wasm(cru, crv, u_d_lo, v_d_lo, rnd_i64); + let r_ch_hi = chroma_i64x2_wasm(cru, crv, u_d_hi, v_d_hi, rnd_i64); + let g_ch_lo = chroma_i64x2_wasm(cgu, cgv, u_d_lo, v_d_lo, rnd_i64); + let g_ch_hi = chroma_i64x2_wasm(cgu, cgv, u_d_hi, v_d_hi, rnd_i64); + let b_ch_lo = chroma_i64x2_wasm(cbu, cbv, u_d_lo, v_d_lo, rnd_i64); + let b_ch_hi = chroma_i64x2_wasm(cbu, cbv, u_d_hi, v_d_hi, rnd_i64); + + // Combine each i64x2 pair → i32x4 [c0, c1, c2, c3]. + let r_ch_i32 = combine_i64x2_pair_to_i32x4(r_ch_lo, r_ch_hi); + let g_ch_i32 = combine_i64x2_pair_to_i32x4(g_ch_lo, g_ch_hi); + let b_ch_i32 = combine_i64x2_pair_to_i32x4(b_ch_lo, b_ch_hi); + + // Duplicate 4 chroma values into 8 per-pixel slots (4:2:2). + // chroma_dup_i32x4_u16([c0,c1,c2,c3]) → + // lo = [c0,c0,c1,c1], hi = [c2,c2,c3,c3] + let (r_dup_lo, r_dup_hi) = chroma_dup_i32x4_u16(r_ch_i32); + let (g_dup_lo, g_dup_hi) = chroma_dup_i32x4_u16(g_ch_i32); + let (b_dup_lo, b_dup_hi) = chroma_dup_i32x4_u16(b_ch_i32); + + // Y: unsigned widen 8 u16 → 2 × i32x4, subtract y_off, scale in i64. + let y_lo_u32 = u32x4_extend_low_u16x8(y_vec); + let y_hi_u32 = u32x4_extend_high_u16x8(y_vec); + let y_lo_i32 = i32x4_sub(y_lo_u32, y_off32); + let y_hi_i32 = i32x4_sub(y_hi_u32, y_off32); + + let y_lo_scaled = scale_y_i32x4_i64_wasm(y_lo_i32, y_scale_i64, rnd_i64); + let y_hi_scaled = scale_y_i32x4_i64_wasm(y_hi_i32, y_scale_i64, rnd_i64); + + // Add Y + chroma, saturating narrow i32 → u16 (clamps [0, 65535]). + let r_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, r_dup_lo), + i32x4_add(y_hi_scaled, r_dup_hi), + ); + let g_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, g_dup_lo), + i32x4_add(y_hi_scaled, g_dup_hi), + ); + let b_u16 = u16x8_narrow_i32x4( + i32x4_add(y_lo_scaled, b_dup_lo), + i32x4_add(y_hi_scaled, b_dup_hi), + ); + + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + x += 8; } - x += 8; } // Scalar tail. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -373,48 +385,56 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(luma_out.len() >= width); unsafe { - // Y permute: even u16 lanes → low 8 bytes; zeroed high. - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - // 16 px/iter: two groups of 8 Y samples. - while x + 16 <= width { - // lo group: Y0..Y7 from bytes x*2 .. x*2+32. 
- let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); - let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo0 = u8x16_swizzle(lo0, y_idx); - let y_lo1 = u8x16_swizzle(lo1, y_idx); - let y_lo = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); - - // hi group: Y8..Y15 from bytes x*2+32 .. x*2+64. - let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); - let y_hi0 = u8x16_swizzle(hi0, y_idx); - let y_hi1 = u8x16_swizzle(hi1, y_idx); - let y_hi = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); - - // >> 8: extract high byte of each u16 Y sample. - let y_shr_lo = u16x8_shr(y_lo, 8); - let y_shr_hi = u16x8_shr(y_hi, 8); - // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255). - let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi); - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8); - x += 16; + if !BE { + // Y permute: even u16 lanes → low 8 bytes; zeroed high. + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + // 16 px/iter: two groups of 8 Y samples. + while x + 16 <= width { + // lo group: Y0..Y7 from bytes x*2 .. x*2+32. + let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); + let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo0 = u8x16_swizzle(lo0, y_idx); + let y_lo1 = u8x16_swizzle(lo1, y_idx); + let y_lo = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); + + // hi group: Y8..Y15 from bytes x*2+32 .. x*2+64. + let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); + let y_hi0 = u8x16_swizzle(hi0, y_idx); + let y_hi1 = u8x16_swizzle(hi1, y_idx); + let y_hi = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); + + // >> 8: extract high byte of each u16 Y sample. + let y_shr_lo = u16x8_shr(y_lo, 8); + let y_shr_hi = u16x8_shr(y_hi, 8); + // Narrow 16 i16 → 16 u8 (no saturation needed; values ≤ 255). + let y_u8 = u8x16_narrow_i16x8(y_shr_lo, y_shr_hi); + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_u8); + x += 16; + } } + // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -432,44 +452,52 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(luma_out.len() >= width); unsafe { - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift). 
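// What the swizzle/shuffle pairs below implement, written out in
// scalar form: Y lives at the even u16 lanes of the Y0 U0 Y1 V0
// interleave, so luma extraction is a stride-2 gather (sketch):
fn gather_y(packed: &[u16], y_out: &mut [u16]) {
    for (i, y) in y_out.iter_mut().enumerate() {
        *y = packed[i * 2];
    }
}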
- while x + 16 <= width { - // lo group: Y0..Y7 - let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); - let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo0 = u8x16_swizzle(lo0, y_idx); - let y_lo1 = u8x16_swizzle(lo1, y_idx); - let y_lo = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); - - // hi group: Y8..Y15 - let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); - let y_hi0 = u8x16_swizzle(hi0, y_idx); - let y_hi1 = u8x16_swizzle(hi1, y_idx); - let y_hi = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); - - // Direct store — full 16-bit Y, no shift. - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); - v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi); - x += 16; + if !BE { + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift). + while x + 16 <= width { + // lo group: Y0..Y7 + let lo0 = v128_load(packed.as_ptr().add(x * 2).cast()); + let lo1 = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo0 = u8x16_swizzle(lo0, y_idx); + let y_lo1 = u8x16_swizzle(lo1, y_idx); + let y_lo = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo0, y_lo1); + + // hi group: Y8..Y15 + let hi0 = v128_load(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = v128_load(packed.as_ptr().add(x * 2 + 24).cast()); + let y_hi0 = u8x16_swizzle(hi0, y_idx); + let y_hi1 = u8x16_swizzle(hi1, y_idx); + let y_hi = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_hi0, y_hi1); + + // Direct store — full 16-bit Y, no shift. + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); + v128_store(luma_out.as_mut_ptr().add(x + 8).cast(), y_hi); + x += 16; + } } + // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/wasm_simd128/y2xx.rs b/src/row/arch/wasm_simd128/y2xx.rs index 91e77803..83c4a6eb 100644 --- a/src/row/arch/wasm_simd128/y2xx.rs +++ b/src/row/arch/wasm_simd128/y2xx.rs @@ -137,7 +137,11 @@ unsafe fn unpack_y2xx_8px_wasm(ptr: *const u16, shr_count: u32) -> (v128, v128, /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -165,112 +169,115 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits - // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low - // 8 bytes of the result hold the saturated u8 of the input - // i16x8. Only the first 8 bytes per channel matter. 
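// Per-lane model of the narrow-with-duplicated-halves trick described
// above: u8x16_narrow_i16x8(v, v) saturates each i16 lane to u8 and
// writes the same 8 results into both halves, of which only the low
// half is consumed. Equivalent scalar clamp per lane:
fn sat_u8(x: i16) -> u8 {
    x.clamp(0, 255) as u8
}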
- let r_sum = i16x8_add_sat(y_scaled, r_dup); - let g_sum = i16x8_add_sat(y_scaled, g_dup); - let b_sum = i16x8_add_sat(y_scaled, b_dup); - let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum); - let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum); - let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum); - - // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] / - // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so - // for the 8-px-iter body we use the v210-style stack-buffer + - // scalar interleave pattern. (8 px × 3 = 24 bytes RGB, - // 8 px × 4 = 32 bytes RGBA.) - let mut r_tmp = [0u8; 16]; - let mut g_tmp = [0u8; 16]; - let mut b_tmp = [0u8; 16]; - v128_store(r_tmp.as_mut_ptr().cast(), r_u8); - v128_store(g_tmp.as_mut_ptr().cast(), g_u8); - v128_store(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 8 * 4]; - for i in 0..8 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 8 * 3]; - for i in 0..8 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = i32x4_splat(RND); + let y_off_v = i16x8_splat(y_off as i16); + let y_scale_v = i32x4_splat(y_scale); + let c_scale_v = i32x4_splat(c_scale); + let bias_v = i16x8_splat(bias as i16); + // Loop-invariant runtime shift count for `u16x8_shr`, see + // module-level note. + let shr_count: u32 = 16 - BITS; + let cru = i32x4_splat(coeffs.r_u()); + let crv = i32x4_splat(coeffs.r_v()); + let cgu = i32x4_splat(coeffs.g_u()); + let cgv = i32x4_splat(coeffs.g_v()); + let cbu = i32x4_splat(coeffs.b_u()); + let cbv = i32x4_splat(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_wasm(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = i16x8_sub(u_vec, bias_v); + let v_i16 = i16x8_sub(v_vec, bias_v); + + // Widen 8-lane i16 chroma to two i32x4 halves so the Q15 + // multiplies don't overflow. Only lanes 0..3 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x8` to recycle the helper exactly; the + // don't-care output lanes are discarded by the [`dup_lo`] + // duplicate step below (which only consumes lanes 0..3). + let u_lo_i32 = i32x4_extend_low_i16x8(u_i16); + let u_hi_i32 = i32x4_extend_high_i16x8(u_i16); + let v_lo_i32 = i32x4_extend_low_i16x8(v_i16); + let v_hi_i32 = i32x4_extend_high_i16x8(v_i16); + + let u_d_lo = q15_shift(i32x4_add(i32x4_mul(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(i32x4_add(i32x4_mul(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(i32x4_add(i32x4_mul(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(i32x4_add(i32x4_mul(v_hi_i32, c_scale_v), rnd_v)); + + // 8-lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // [`dup_lo`] so lanes 0..7 of `r_dup` align with Y0..Y7. Lane + // order: [c0, c0, c1, c1, c2, c2, c3, c3]. + let r_dup = dup_lo(r_chroma); + let g_dup = dup_lo(g_chroma); + let b_dup = dup_lo(b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. 
+ let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `u8x16_narrow_i16x8(lo, hi)` emits + // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` so the low + // 8 bytes of the result hold the saturated u8 of the input + // i16x8. Only the first 8 bytes per channel matter. + let r_sum = i16x8_add_sat(y_scaled, r_dup); + let g_sum = i16x8_add_sat(y_scaled, g_dup); + let b_sum = i16x8_add_sat(y_scaled, b_dup); + let r_u8 = u8x16_narrow_i16x8(r_sum, r_sum); + let g_u8 = u8x16_narrow_i16x8(g_sum, g_sum); + let b_u8 = u8x16_narrow_i16x8(b_sum, b_sum); + + // 8-pixel partial store: wasm-simd128's [`write_rgb_16`] / + // [`write_rgba_16`] emit 16-pixel output (48 / 64 bytes), so + // for the 8-px-iter body we use the v210-style stack-buffer + + // scalar interleave pattern. (8 px × 3 = 24 bytes RGB, + // 8 px × 4 = 32 bytes RGBA.) + let mut r_tmp = [0u8; 16]; + let mut g_tmp = [0u8; 16]; + let mut b_tmp = [0u8; 16]; + v128_store(r_tmp.as_mut_ptr().cast(), r_u8); + v128_store(g_tmp.as_mut_ptr().cast(), g_u8); + v128_store(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 8 * 4]; + for i in 0..8 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 8 * 3]; + for i in 0..8 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 8; + x += 8; + } } // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -296,7 +303,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -322,72 +333,76 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -413,7 +428,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -430,40 +445,44 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - // Y permute mask: pick even u16 lanes (low byte at [0], high byte - // at [1]) into the low 8 bytes; high 8 bytes zeroed. - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = v128_load(packed.as_ptr().add(x * 2).cast()); - let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _] - let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _] - // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as - // the 4:2:2 unpack helper. 
- let y_vec = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned - - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`. - let y_shr = u16x8_shr(y_vec, 8); - // Pack 8 i16 lanes to u8 — only low 8 bytes used. - let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr); - // Store low 8 bytes via stack buffer + copy_from_slice. - let mut tmp = [0u8; 16]; - v128_store(tmp.as_mut_ptr().cast(), y_u8); - luma_out[x..x + 8].copy_from_slice(&tmp[..8]); - - x += 8; + if !BE { + // Y permute mask: pick even u16 lanes (low byte at [0], high byte + // at [1]) into the low 8 bytes; high 8 bytes zeroed. + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = v128_load(packed.as_ptr().add(x * 2).cast()); + let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = u8x16_swizzle(lo, y_idx); // [Y0..Y3, _, _, _, _] + let y_hi = u8x16_swizzle(hi, y_idx); // [Y4..Y7, _, _, _, _] + // Concatenate low halves: same `_mm_unpacklo_epi64` pattern as + // the 4:2:2 unpack helper. + let y_vec = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); // [Y0..Y7] MSB-aligned + + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON's `vshrn_n_u16::<8>` and SSE4.1's `_mm_srli_epi16::<8>`. + let y_shr = u16x8_shr(y_vec, 8); + // Pack 8 i16 lanes to u8 — only low 8 bytes used. + let y_u8 = u8x16_narrow_i16x8(y_shr, y_shr); + // Store low 8 bytes via stack buffer + copy_from_slice. + let mut tmp = [0u8; 16]; + v128_store(tmp.as_mut_ptr().cast(), y_u8); + luma_out[x..x + 8].copy_from_slice(&tmp[..8]); + + x += 8; + } } + // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -480,7 +499,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -497,29 +516,33 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let shr_count: u32 = 16 - BITS; - let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = v128_load(packed.as_ptr().add(x * 2).cast()); - let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = u8x16_swizzle(lo, y_idx); - let y_hi = u8x16_swizzle(hi, y_idx); - let y_vec = - i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples - // into low-bit-packed form for the native-depth u16 output. 
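// Check of the two-stage shift collapse noted above: the
// MSB-alignment shift and the depth reduction always sum to 8 bits
// for BITS in {10, 12}, so a single >> 8 is exact (sketch):
fn luma_u8_two_stage<const BITS: u32>(raw: u16) -> u8 {
    ((raw >> (16 - BITS)) >> (BITS - 8)) as u8
}

fn luma_u8_collapsed(raw: u16) -> u8 {
    (raw >> 8) as u8
}
// (16 - BITS) + (BITS - 8) == 8, so both agree on every sample; e.g.
// for BITS = 10, 0xFFC0 -> 0x3FF -> 0xFF either way.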
- let y_low = u16x8_shr(y_vec, shr_count); - v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 8; + if !BE { + let shr_count: u32 = 16 - BITS; + let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = v128_load(packed.as_ptr().add(x * 2).cast()); + let hi = v128_load(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = u8x16_swizzle(lo, y_idx); + let y_hi = u8x16_swizzle(hi, y_idx); + let y_vec = + i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23>(y_lo, y_hi); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = u16x8_shr(y_vec, shr_count); + v128_store(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 8; + } } + // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/tests/v210.rs b/src/row/arch/x86_avx2/tests/v210.rs index 9c1f8315..d6bf96ae 100644 --- a/src/row/arch/x86_avx2/tests/v210.rs +++ b/src/row/arch/x86_avx2/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + 
v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v210→luma u16 diverges (width={width})"); } @@ -238,7 +238,7 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx2 v210 luma reorder bug"); @@ -247,9 +247,15 @@ fn avx2_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx2/tests/y216.rs b/src/row/arch/x86_avx2/tests/y216.rs index f7428a32..34cd1b89 100644 --- a/src/row/arch/x86_avx2/tests/y216.rs +++ b/src/row/arch/x86_avx2/tests/y216.rs @@ -16,9 +16,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -33,9 +33,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -49,9 +49,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y216→luma u8 diverges (width={width})"); } @@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) { 
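Two complementary test shapes recur across these files: seeded-random parity (scalar vs SIMD must match byte-for-byte, since both use the same integer math) and a lane-order ramp, where Y encodes the pixel index so a shuffle bug shows up as a visibly reordered sequence instead of an opaque mismatch. Distilled below with hypothetical generator and kernel names; the in-tree `pseudo_random_*` helpers may differ in detail:

fn check_parity(width: usize) {
    // Deterministic input so failures replay exactly.
    let packed = pseudo_random_words(width * 2, 0xAA55);
    let mut s = vec![0u8; width];
    let mut k = vec![0u8; width];
    scalar_luma(&packed, &mut s, width);
    unsafe { simd_luma(&packed, &mut k, width) };
    assert_eq!(s, k, "SIMD diverges from scalar (width={width})");
}

fn check_lane_order(width: usize) {
    // Y_i = i + 1 at even u16 slots, neutral chroma at odd slots: any
    // permute mistake scrambles the expected 1..=width ramp.
    let mut packed = vec![0x8000u16; width * 2];
    for i in 0..width {
        packed[i * 2] = i as u16 + 1;
    }
    let mut luma = vec![0u16; width];
    for i in 0..width {
        luma[i] = packed[i * 2]; // stand-in for the kernel under test
    }
    let expected: Vec<u16> = (1..=width as u16).collect();
    assert_eq!(luma, expected, "lane reorder bug");
}

fn pseudo_random_words(n: usize, mut seed: u16) -> Vec<u16> {
    // Any deterministic generator works; a 16-bit xorshift keeps it tiny.
    (0..n)
        .map(|_| {
            seed ^= seed << 7;
            seed ^= seed >> 9;
            seed ^= seed << 8;
            seed
        })
        .collect()
}

fn scalar_luma(packed: &[u16], out: &mut [u8], width: usize) {
    for i in 0..width {
        out[i] = (packed[i * 2] >> 8) as u8;
    }
}

unsafe fn simd_luma(packed: &[u16], out: &mut [u8], width: usize) {
    scalar_luma(packed, out, width) // stand-in for the arch kernel
}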
let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y216→luma u16 diverges (width={width})"); } @@ -169,7 +169,7 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "AVX2 y216 luma_u16 reorder bug"); @@ -178,9 +178,15 @@ fn avx2_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx2/tests/y2xx.rs b/src/row/arch/x86_avx2/tests/y2xx.rs index de7fcd45..26825f38 100644 --- a/src/row/arch/x86_avx2/tests/y2xx.rs +++ b/src/row/arch/x86_avx2/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -107,9 +107,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -121,9 +121,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -135,9 +135,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = 
std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -149,9 +151,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -163,9 +167,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -174,9 +178,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 y2xx<{BITS}>→luma u16 diverges (width={width})"); } @@ -262,15 +266,15 @@ fn avx2_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "AVX2 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -278,7 +282,7 @@ fn avx2_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -290,17 +294,17 @@ fn avx2_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "AVX2 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + 
y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "AVX2 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_avx2/v210.rs b/src/row/arch/x86_avx2/v210.rs index 49407edd..13164309 100644 --- a/src/row/arch/x86_avx2/v210.rs +++ b/src/row/arch/x86_avx2/v210.rs @@ -34,7 +34,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x8, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks two consecutive 16-byte v210 words (= 12 pixels) into @@ -63,11 +63,11 @@ use crate::{ColorMatrix, row::scalar}; /// `target_feature` includes AVX2 (which implies AVX, SSSE3, etc.). #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 32 bytes readable; AVX2 // (and thus SSSE3) is available. unsafe { - let words = _mm256_loadu_si256(ptr.cast()); + let words = load_endian_u32x8::(ptr); let mask10 = _mm256_set1_epi32(0x3FF); let low10 = _mm256_and_si256(words, mask10); let mid10 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask10); @@ -224,7 +224,7 @@ unsafe fn unpack_v210_2words_avx2(ptr: *const u8) -> (__m256i, __m256i, __m256i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -263,7 +263,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( // Main loop: 12 pixels (2 v210 words = 32 bytes) per iteration. let pairs = words / 2; for p in 0..pairs { - let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); let y_i16 = y_vec; @@ -369,7 +369,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -386,7 +392,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -424,7 +430,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let pairs = words / 2; for p in 0..pairs { - let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, u_vec, v_vec) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); let y_i16 = y_vec; let u_i16 = _mm256_sub_epi16(u_vec, bias_v); @@ -503,7 +509,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -526,7 +532,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. 
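`unpack_v210_2words_avx2` is the vector form of plain v210 field extraction: three 10-bit samples per 32-bit word at bit offsets 0/10/20, top two bits padding. The luma kernels that follow then either truncate to 8 bits (`>> 2`) or keep the 10-bit value as-is. Scalar reference:

// v210 word split: three 10-bit fields per u32 (bits 30..31 unused).
fn unpack_v210_word(word: u32) -> (u16, u16, u16) {
    let low10 = (word & 0x3FF) as u16;
    let mid10 = ((word >> 10) & 0x3FF) as u16;
    let hi10 = ((word >> 20) & 0x3FF) as u16;
    (low10, mid10, hi10)
}

// Luma reduction at both output depths: truncating >> 2 to u8,
// straight copy (low-bit-packed) to u16.
fn y10_to_u8(y10: u16) -> u8 {
    (y10 >> 2) as u8
}

fn main() {
    let w = (0x155 << 20) | (0x2AA << 10) | 0x0FF;
    assert_eq!(unpack_v210_word(w), (0x0FF, 0x2AA, 0x155));
    assert_eq!(y10_to_u8(0x3FF), 0xFF);
}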
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -537,7 +547,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: unsafe { let pairs = words / 2; for p in 0..pairs { - let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, _, _) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x32 via packus. let y_shr = _mm256_srli_epi16::<2>(y_vec); let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256()); @@ -554,7 +564,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -571,7 +581,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -582,7 +596,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w unsafe { let pairs = words / 2; for p in 0..pairs { - let (y_vec, _, _) = unpack_v210_2words_avx2(packed.as_ptr().add(p * 32)); + let (y_vec, _, _) = unpack_v210_2words_avx2::(packed.as_ptr().add(p * 32)); // Store first 12 of the 16 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 16]; _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_vec); @@ -596,7 +610,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[pairs * 32..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/y216.rs b/src/row/arch/x86_avx2/y216.rs index 4184b3bb..cf850e18 100644 --- a/src/row/arch/x86_avx2/y216.rs +++ b/src/row/arch/x86_avx2/y216.rs @@ -109,7 +109,7 @@ unsafe fn unpack_y216_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -128,137 +128,146 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { - let rnd_v = _mm256_set1_epi32(RND); - // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off. - let y_off_v = _mm256_set1_epi32(y_off); - let y_scale_v = _mm256_set1_epi32(y_scale); - let c_scale_v = _mm256_set1_epi32(c_scale); - // Chroma bias: 32768 via wrapping 0x8000 = -32768i16. 
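That bias constant is the usual wrapping trick: `sub_epi16(u, -32768)` adds `0x8000` modulo 2^16, which is the same as XOR with `0x8000`, and reading the lanes back as i16 yields the sample minus 32768, i.e. full-range chroma re-centered at zero. In scalar terms:

// Chroma centering via the 0x8000 wrap.
fn center_chroma(u: u16) -> i16 {
    (u ^ 0x8000) as i16 // == wrapping u + 32768, read back as signed
}

fn main() {
    assert_eq!(center_chroma(0x8000), 0); // mid-scale chroma maps to 0
    assert_eq!(center_chroma(0x0000), -32768);
    assert_eq!(center_chroma(0xFFFF), 32767);
    // Identical to the wrapping subtraction the SIMD lanes perform:
    assert_eq!(center_chroma(0x1234), 0x1234u16.wrapping_sub(0x8000) as i16);
}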
- let bias16_v = _mm256_set1_epi16(-32768i16); - let cru = _mm256_set1_epi32(coeffs.r_u()); - let crv = _mm256_set1_epi32(coeffs.r_v()); - let cgu = _mm256_set1_epi32(coeffs.g_u()); - let cgv = _mm256_set1_epi32(coeffs.g_v()); - let cbu = _mm256_set1_epi32(coeffs.b_u()); - let cbv = _mm256_set1_epi32(coeffs.b_v()); - let alpha_u8 = _mm256_set1_epi8(-1i8); - let mut x = 0usize; - while x + 32 <= width { - // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ------ - let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); - - // Chroma bias subtraction (wrapping). - let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); - let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v); - - // Widen 8 valid chroma i16 lanes to two i32x8 halves. - // Only the low 128 bits of u_lo_vec carry valid U0..U7; - // the high 128 bits are zeroed by the 0x88 permute (don't-care). - let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16)); - let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16)); - let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16)); - let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16)); - - let u_d_lo_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_a, c_scale_v), - rnd_v, - )); - let u_d_lo_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_b, c_scale_v), - rnd_v, - )); - let v_d_lo_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_a, c_scale_v), - rnd_v, - )); - let v_d_lo_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_b, c_scale_v), - rnd_v, - )); - - // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo). - let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); - - // Duplicate each chroma into its 4:2:2 Y-pair slot. - // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used - // here since we have only 8 chroma samples per 16-px half. - let (r_dup_lo, _) = chroma_dup(r_chroma_lo); - let (g_dup_lo, _) = chroma_dup(g_chroma_lo); - let (b_dup_lo, _) = chroma_dup(b_chroma_lo); - - // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767. 
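The reason for the unsigned widening: a 16-bit Y sample above 32767 turns negative if the lanes are reinterpreted as i16 before scaling, so the Y path widens to i32 first and only then subtracts the offset. A scalar sketch of the scale step, assuming the module's `RND` is `1 << 14` (the Q15 half-LSB, also spelled out as `rnd32_v` in the u16 path):

// Q15 luma scale, widened before the subtract so Y in 32768..=65535
// stays positive: result = ((Y - y_off) * y_scale + RND) >> 15.
fn scale_y_scalar(y: u16, y_off: i32, y_scale: i32) -> i32 {
    const RND: i32 = 1 << 14;
    ((y as i32 - y_off) * y_scale + RND) >> 15
}

fn main() {
    // With unity scale a full-range sample survives untouched;
    // (40000u16 as i16) would have been -25536.
    assert_eq!(scale_y_scalar(40_000, 0, 1 << 15), 40_000);
}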
- let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v); - - // --- hi group: pixels x+16..x+31 ----------------------------------- - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32)); - - let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); - let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v); - - let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16)); - let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16)); - let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16)); - let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16)); - - let u_d_hi_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_a, c_scale_v), - rnd_v, - )); - let u_d_hi_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_b, c_scale_v), - rnd_v, - )); - let v_d_hi_a = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_a, c_scale_v), - rnd_v, - )); - let v_d_hi_b = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_b, c_scale_v), - rnd_v, - )); - - let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); - - let (r_dup_hi, _) = chroma_dup(r_chroma_hi); - let (g_dup_hi, _) = chroma_dup(g_chroma_hi); - let (b_dup_hi, _) = chroma_dup(b_chroma_hi); - - let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v); - - // Saturating add + narrow to u8x32 (32 pixels per channel). - let r_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, r_dup_lo), - _mm256_adds_epi16(y_hi_scaled, r_dup_hi), - ); - let g_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, g_dup_lo), - _mm256_adds_epi16(y_hi_scaled, g_dup_hi), - ); - let b_u8 = narrow_u8x32( - _mm256_adds_epi16(y_lo_scaled, b_dup_lo), - _mm256_adds_epi16(y_hi_scaled, b_dup_hi), - ); + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off. + let y_off_v = _mm256_set1_epi32(y_off); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + // Chroma bias: 32768 via wrapping 0x8000 = -32768i16. + let bias16_v = _mm256_set1_epi16(-32768i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm256_set1_epi8(-1i8); + + while x + 32 <= width { + // --- lo group: pixels x..x+15 (two 256-bit loads, 16 pixels) ---- + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); + + // Chroma bias subtraction (wrapping). + let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); + let v_lo_i16 = _mm256_sub_epi16(v_lo_vec, bias16_v); + + // Widen 8 valid chroma i16 lanes to two i32x8 halves. + // Only the low 128 bits of u_lo_vec carry valid U0..U7; + // the high 128 bits are zeroed by the 0x88 permute (don't-care). 
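`chroma_dup`, used on both sides of this hunk, encodes the 4:2:2 upsampling policy shared across the patch: each chroma sample is reused verbatim by the pair of luma samples it is co-sited with, nearest-neighbour duplication rather than interpolation. The scalar shape:

// 4:2:2 chroma duplication: chroma sample i feeds pixels 2i and 2i+1.
fn chroma_dup(chroma: &[i16], per_pixel: &mut [i16]) {
    for (i, &c) in chroma.iter().enumerate() {
        per_pixel[i * 2] = c;
        per_pixel[i * 2 + 1] = c;
    }
}

fn main() {
    let mut out = [0i16; 8];
    chroma_dup(&[10, 20, 30, 40], &mut out);
    assert_eq!(out, [10, 10, 20, 20, 30, 30, 40, 40]);
}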
+ let u_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_lo_i16)); + let u_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_lo_i16)); + let v_lo_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_lo_i16)); + let v_lo_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_lo_i16)); + + let u_d_lo_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_a, c_scale_v), + rnd_v, + )); + let u_d_lo_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_b, c_scale_v), + rnd_v, + )); + let v_d_lo_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_a, c_scale_v), + rnd_v, + )); + let v_d_lo_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_b, c_scale_v), + rnd_v, + )); + + // chroma_i16x16: 16-lane vector with valid data in lanes 0..7 (lo). + let r_chroma_lo = chroma_i16x16(cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + let g_chroma_lo = chroma_i16x16(cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + let b_chroma_lo = chroma_i16x16(cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v); + + // Duplicate each chroma into its 4:2:2 Y-pair slot. + // chroma_dup returns (lo16, hi16); only lo16 (lanes 0..15) is used + // here since we have only 8 chroma samples per 16-px half. + let (r_dup_lo, _) = chroma_dup(r_chroma_lo); + let (g_dup_lo, _) = chroma_dup(g_chroma_lo); + let (b_dup_lo, _) = chroma_dup(b_chroma_lo); + + // Y scale: unsigned-widened to avoid i16 overflow for Y > 32767. + let y_lo_scaled = scale_y_u16_avx2(y_lo_vec, y_off_v, y_scale_v, rnd_v); + + // --- hi group: pixels x+16..x+31 ----------------------------------- + let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2 + 32)); + + let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); + let v_hi_i16 = _mm256_sub_epi16(v_hi_vec, bias16_v); + + let u_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_hi_i16)); + let u_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_hi_i16)); + let v_hi_a = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_hi_i16)); + let v_hi_b = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_hi_i16)); + + let u_d_hi_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_a, c_scale_v), + rnd_v, + )); + let u_d_hi_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_b, c_scale_v), + rnd_v, + )); + let v_d_hi_a = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_a, c_scale_v), + rnd_v, + )); + let v_d_hi_b = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_b, c_scale_v), + rnd_v, + )); + + let r_chroma_hi = chroma_i16x16(cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + let g_chroma_hi = chroma_i16x16(cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + let b_chroma_hi = chroma_i16x16(cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v); + + let (r_dup_hi, _) = chroma_dup(r_chroma_hi); + let (g_dup_hi, _) = chroma_dup(g_chroma_hi); + let (b_dup_hi, _) = chroma_dup(b_chroma_hi); + + let y_hi_scaled = scale_y_u16_avx2(y_hi_vec, y_off_v, y_scale_v, rnd_v); + + // Saturating add + narrow to u8x32 (32 pixels per channel). 
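`_mm256_adds_epi16` followed by the unsigned pack acts as one fused clamp per lane: the saturating i16 add keeps `Y + chroma` from wrapping, and `packus` clips the sum into 0..=255. Per pixel this is simply:

// Scalar equivalent of adds_epi16 + packus for a single channel.
fn combine_to_u8(y_scaled: i16, chroma: i16) -> u8 {
    y_scaled.saturating_add(chroma).clamp(0, 255) as u8
}

fn main() {
    assert_eq!(combine_to_u8(300, 50), 255); // clipped high
    assert_eq!(combine_to_u8(10, -50), 0);   // clipped low
    assert_eq!(combine_to_u8(100, 20), 120);
}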
+ let r_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, r_dup_lo), + _mm256_adds_epi16(y_hi_scaled, r_dup_hi), + ); + let g_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, g_dup_lo), + _mm256_adds_epi16(y_hi_scaled, g_dup_hi), + ); + let b_u8 = narrow_u8x32( + _mm256_adds_epi16(y_lo_scaled, b_dup_lo), + _mm256_adds_epi16(y_hi_scaled, b_dup_hi), + ); - if ALPHA { - write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); - } + if ALPHA { + write_rgba_32(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_32(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } - x += 32; + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -280,7 +289,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -298,132 +307,135 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_v = _mm256_set1_epi64x(RND); - let rnd32_v = _mm256_set1_epi32(1 << 14); - let y_off_v = _mm256_set1_epi32(y_off); - let y_scale_v = _mm256_set1_epi32(y_scale); - let c_scale_v = _mm256_set1_epi32(c_scale); - // Chroma bias via wrapping 0x8000 trick. - let bias16_v = _mm256_set1_epi16(-32768i16); - let cru = _mm256_set1_epi32(coeffs.r_u()); - let crv = _mm256_set1_epi32(coeffs.r_v()); - let cgu = _mm256_set1_epi32(coeffs.g_u()); - let cgv = _mm256_set1_epi32(coeffs.g_v()); - let cbu = _mm256_set1_epi32(coeffs.b_u()); - let cbv = _mm256_set1_epi32(coeffs.b_v()); - let mut x = 0usize; - while x + 16 <= width { - // Two 256-bit loads → 16 pixels, 8 UV pairs. - let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); - - // Subtract chroma bias. - let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); - let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); - - // Widen 8 valid chroma i16 lanes to i32x8. - // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute. - let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); - let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - - // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32). - let u_d = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_i32, c_scale_v), - rnd32_v, - )); - let v_d = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_i32, c_scale_v), - rnd32_v, - )); - - // i64 chroma: even/odd i32 lanes via 0xF5 shuffle. 
- let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d); - let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v); - let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v); - let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v); - let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); - let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v); - let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); - - // Reassemble i64x4 pairs → i32x8 [c0..c7]. - let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd); - let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd); - let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd); - - // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2). - let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32); - let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32); - let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32); - - // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. - // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes. - let y_lo_u16 = _mm256_castsi256_si128(y_vec); - let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec); - let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v); - let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v); - - let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v); - let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v); - - // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup. - let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, r_dup_lo), - _mm256_add_epi32(y_hi_scaled, r_dup_hi), - )); - let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, g_dup_lo), - _mm256_add_epi32(y_hi_scaled, g_dup_hi), - )); - let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( - _mm256_add_epi32(y_lo_scaled, b_dup_lo), - _mm256_add_epi32(y_hi_scaled, b_dup_hi), - )); - - // Write 16 pixels via two 8-pixel helpers. - if ALPHA { - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - alpha_u16, - dst, - ); - write_rgba_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - alpha_u16, - dst.add(32), - ); - } else { - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r_u16), - _mm256_castsi256_si128(g_u16), - _mm256_castsi256_si128(b_u16), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r_u16), - _mm256_extracti128_si256::<1>(g_u16), - _mm256_extracti128_si256::<1>(b_u16), - dst.add(24), - ); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_v = _mm256_set1_epi64x(RND); + let rnd32_v = _mm256_set1_epi32(1 << 14); + let y_off_v = _mm256_set1_epi32(y_off); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + // Chroma bias via wrapping 0x8000 trick. + let bias16_v = _mm256_set1_epi16(-32768i16); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + // Two 256-bit loads → 16 pixels, 8 UV pairs. 
+ let (y_vec, u_vec, v_vec) = unpack_y216_16px_avx2(packed.as_ptr().add(x * 2)); + + // Subtract chroma bias. + let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); + + // Widen 8 valid chroma i16 lanes to i32x8. + // Low 128 of u_vec / v_vec hold U0..U7 / V0..V7 after 0x88 permute. + let u_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let v_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + + // Scale UV in i32 (8 lanes; |chroma_centered × c_scale| fits i32). + let u_d = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_i32, c_scale_v), + rnd32_v, + )); + let v_d = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_i32, c_scale_v), + rnd32_v, + )); + + // i64 chroma: even/odd i32 lanes via 0xF5 shuffle. + let u_d_odd = _mm256_shuffle_epi32::<0xF5>(u_d); + let v_d_odd = _mm256_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x4_avx2(cru, crv, u_d, v_d, rnd_v); + let r_ch_odd = chroma_i64x4_avx2(cru, crv, u_d_odd, v_d_odd, rnd_v); + let g_ch_even = chroma_i64x4_avx2(cgu, cgv, u_d, v_d, rnd_v); + let g_ch_odd = chroma_i64x4_avx2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); + let b_ch_even = chroma_i64x4_avx2(cbu, cbv, u_d, v_d, rnd_v); + let b_ch_odd = chroma_i64x4_avx2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); + + // Reassemble i64x4 pairs → i32x8 [c0..c7]. + let r_ch_i32 = reassemble_i64x4_to_i32x8(r_ch_even, r_ch_odd); + let g_ch_i32 = reassemble_i64x4_to_i32x8(g_ch_even, g_ch_odd); + let b_ch_i32 = reassemble_i64x4_to_i32x8(b_ch_even, b_ch_odd); + + // Duplicate each of 8 chroma values into 2 per-pixel slots (4:2:2). + let (r_dup_lo, r_dup_hi) = chroma_dup_i32(r_ch_i32); + let (g_dup_lo, g_dup_hi) = chroma_dup_i32(g_ch_i32); + let (b_dup_lo, b_dup_hi) = chroma_dup_i32(b_ch_i32); + + // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. + // y_vec from unpack_y216_16px_avx2 is __m256i with 16 u16 lanes. + let y_lo_u16 = _mm256_castsi256_si128(y_vec); + let y_hi_u16 = _mm256_extracti128_si256::<1>(y_vec); + let y_lo_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_lo_u16), y_off_v); + let y_hi_i32 = _mm256_sub_epi32(_mm256_cvtepu16_epi32(y_hi_u16), y_off_v); + + let y_lo_scaled = scale_y_i32x8_i64(y_lo_i32, y_scale_v, rnd_v); + let y_hi_scaled = scale_y_i32x8_i64(y_hi_i32, y_scale_v, rnd_v); + + // Add Y + chroma, saturate to u16 via _mm256_packus_epi32 + 0xD8 fixup. + let r_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, r_dup_lo), + _mm256_add_epi32(y_hi_scaled, r_dup_hi), + )); + let g_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, g_dup_lo), + _mm256_add_epi32(y_hi_scaled, g_dup_hi), + )); + let b_u16 = _mm256_permute4x64_epi64::<0xD8>(_mm256_packus_epi32( + _mm256_add_epi32(y_lo_scaled, b_dup_lo), + _mm256_add_epi32(y_hi_scaled, b_dup_hi), + )); + + // Write 16 pixels via two 8-pixel helpers. 
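The u16 path runs its matrix products through i64 (`chroma_i64x4_avx2`, with the even/odd lane split) because at 16-bit depth a centered chroma times a Q15 coefficient can exceed what a 32-bit accumulator guarantees once the rounding term is added. Per sample, assuming `RND` here is the usual Q15 half-LSB:

// Scalar form of the i64 chroma term: products accumulate in i64,
// then one Q15 round back to i32 (RND assumed = 1 << 14).
fn chroma_term(c_u: i64, c_v: i64, u_d: i64, v_d: i64) -> i32 {
    const RND: i64 = 1 << 14;
    ((c_u * u_d + c_v * v_d + RND) >> 15) as i32
}

fn main() {
    // A product near 2^31 is fine in i64 where i32 math would overflow.
    assert_eq!(chroma_term(60_000, 0, 40_000, 0), 73_242);
}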
+ if ALPHA { + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + alpha_u16, + dst, + ); + write_rgba_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + alpha_u16, + dst.add(32), + ); + } else { + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_u16_8( + _mm256_castsi256_si128(r_u16), + _mm256_castsi256_si128(g_u16), + _mm256_castsi256_si128(b_u16), + dst, + ); + write_rgb_u16_8( + _mm256_extracti128_si256::<1>(r_u16), + _mm256_extracti128_si256::<1>(g_u16), + _mm256_extracti128_si256::<1>(b_u16), + dst.add(24), + ); + } + + x += 16; } - - x += 16; } // Scalar tail — remaining < 16 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -450,62 +462,69 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX2 availability is the caller's obligation. unsafe { - // Per-lane Y permute mask: pick even u16 lanes (low byte first) into - // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed. - let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane - ); - let mut x = 0usize; - while x + 32 <= width { - // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31. - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); - let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); - - // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane. - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - let v2s = _mm256_shuffle_epi8(v2, split_idx); - let v3s = _mm256_shuffle_epi8(v3, split_idx); - - // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits. - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); - let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); - - // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16). - let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] - let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] - - // `>> 8` to obtain u8 luma (high byte of each Y u16 sample). - // `_mm256_srli_epi16::<8>` has a literal const count. - let y_lo_shr = _mm256_srli_epi16::<8>(y_lo); - let y_hi_shr = _mm256_srli_epi16::<8>(y_hi); - - // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup. 
- let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr); - _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 32; + if !BE { + // Per-lane Y permute mask: pick even u16 lanes (low byte first) into + // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed. + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane + ); + + while x + 32 <= width { + // Four 256-bit loads: v0/v1 for pixels x..x+15, v2/v3 for x+16..x+31. + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); + let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); + + // Per-lane shuffle → Y into low 64-bit chunk of each 128-bit lane. + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v2s = _mm256_shuffle_epi8(v2, split_idx); + let v3s = _mm256_shuffle_epi8(v3, split_idx); + + // 0x88 = [0, 2, 0, 2]: pack low 64-bit chunks (lane0 + lane1) into low 128 bits. + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); + let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); + + // Cross-vector merge: lo 128 of v0p + lo 128 of v1p → Y0..Y15 (16 u16). + let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] + let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] + + // `>> 8` to obtain u8 luma (high byte of each Y u16 sample). + // `_mm256_srli_epi16::<8>` has a literal const count. + let y_lo_shr = _mm256_srli_epi16::<8>(y_lo); + let y_hi_shr = _mm256_srli_epi16::<8>(y_hi); + + // Narrow 32 × i16 → 32 × u8. narrow_u8x32 already applies 0xD8 lane fixup. + let y_u8 = narrow_u8x32(y_lo_shr, y_hi_shr); + _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -526,52 +545,59 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX2 availability is the caller's obligation. unsafe { - // Per-lane Y permute mask (same as luma_row above). 
- let split_idx = _mm256_setr_epi8( - 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, - -1, -1, -1, -1, -1, -1, - ); - let mut x = 0usize; - while x + 32 <= width { - let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); - let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); - let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); - - let v0s = _mm256_shuffle_epi8(v0, split_idx); - let v1s = _mm256_shuffle_epi8(v1, split_idx); - let v2s = _mm256_shuffle_epi8(v2, split_idx); - let v3s = _mm256_shuffle_epi8(v3, split_idx); - - let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); - let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); - let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); - let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); - - let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] - let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] - - // Direct store — full 16-bit Y values, no shift. - _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo); - _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi); - - x += 32; + if !BE { + // Per-lane Y permute mask (same as luma_row above). + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, + -1, -1, -1, -1, -1, -1, + ); + + while x + 32 <= width { + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v2 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 32).cast()); + let v3 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 48).cast()); + + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v2s = _mm256_shuffle_epi8(v2, split_idx); + let v3s = _mm256_shuffle_epi8(v3, split_idx); + + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let v2p = _mm256_permute4x64_epi64::<0x88>(v2s); + let v3p = _mm256_permute4x64_epi64::<0x88>(v3s); + + let y_lo = _mm256_permute2x128_si256::<0x20>(v0p, v1p); // [Y0..Y15] + let y_hi = _mm256_permute2x128_si256::<0x20>(v2p, v3p); // [Y16..Y31] + + // Direct store — full 16-bit Y values, no shift. + _mm256_storeu_si256(out.as_mut_ptr().add(x).cast(), y_lo); + _mm256_storeu_si256(out.as_mut_ptr().add(x + 16).cast(), y_hi); + + x += 32; + } } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx2/y2xx.rs b/src/row/arch/x86_avx2/y2xx.rs index 1b9d76f3..bc3c5bb1 100644 --- a/src/row/arch/x86_avx2/y2xx.rs +++ b/src/row/arch/x86_avx2/y2xx.rs @@ -164,7 +164,11 @@ unsafe fn unpack_y2xx_16px_avx2( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
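Stripped of the permutes, both y216 luma kernels reduce to indexing the even u16 lane per pixel: the u8 variant keeps the high byte, the u16 variant copies the sample untouched. Scalar pair:

// Scalar core of y216_to_luma_row / y216_to_luma_u16_row; Y sits at
// the even u16 indices of the interleaved Y U Y V stream.
fn y216_luma_u8(packed: &[u16], out: &mut [u8]) {
    for (o, y) in out.iter_mut().zip(packed.iter().step_by(2)) {
        *o = (y >> 8) as u8; // high byte of the 16-bit sample
    }
}

fn y216_luma_u16(packed: &[u16], out: &mut [u16]) {
    for (o, y) in out.iter_mut().zip(packed.iter().step_by(2)) {
        *o = *y; // full 16-bit Y, no shift
    }
}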
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -192,122 +196,125 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - // 16-lane chroma vectors with valid data in lanes 0..7. - let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via - // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15. - // `_dup_hi` is don't-care (covers Y16..Y31 if input had 32 - // chroma; we have only 8). - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8 - // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so - // the low 16 bytes hold the saturated u8 of our 16 valid lanes. - let zero = _mm256_setzero_si256(); - let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero); - let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero); - let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero); - - // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit - // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel - // iter. Use the v210-style stack-buffer + scalar interleave - // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.) - let mut r_tmp = [0u8; 32]; - let mut g_tmp = [0u8; 32]; - let mut b_tmp = [0u8; 32]; - _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8); - _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8); - _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 16 * 4]; - for i in 0..16 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 16 * 3]; - for i in 0..16 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm256_srl_epi16` — see + // module-level note. 
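The module-level note referenced above presumably boils down to an intrinsics constraint: `_mm256_srli_epi16` takes its count as a const argument, and `16 - BITS` cannot be used there on stable Rust (that needs generic const expressions), so the count goes through `_mm256_srl_epi16`, which reads it from the low 64 bits of an `__m128i` and is hoisted once per row. Minimal form:

#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn shr_to_native<const BITS: u32>(
    v: core::arch::x86_64::__m256i,
) -> core::arch::x86_64::__m256i {
    use core::arch::x86_64::*;
    // `_mm256_srli_epi16::<{ 16 - BITS }>(v)` would need
    // generic_const_exprs; the srl form takes a runtime count instead.
    _mm256_srl_epi16(v, _mm_cvtsi32_si128((16 - BITS) as i32))
}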
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + // Widen 8-valid-lane i16 chroma to two i32x8 halves so the Q15 + // multiplies don't overflow. Only lanes 0..7 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x16` to recycle the helper exactly; the + // don't-care output lanes are discarded by the + // `chroma_dup` step below (which only consumes lanes 0..7 in + // its `lo16` return). + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + // 16-lane chroma vectors with valid data in lanes 0..7. + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `chroma_dup` so lanes 0..15 of `_dup_lo` align with Y0..Y15. + // `_dup_hi` is don't-care (covers Y16..Y31 if input had 32 + // chroma; we have only 8). + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x16. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `narrow_u8x32(lo, hi)` emits 32 u8 + // lanes from 32 i16 lanes; we feed `lo` and zero for `hi` so + // the low 16 bytes hold the saturated u8 of our 16 valid lanes. + let zero = _mm256_setzero_si256(); + let r_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, r_dup_lo), zero); + let g_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, g_dup_lo), zero); + let b_u8 = narrow_u8x32(_mm256_adds_epi16(y_scaled, b_dup_lo), zero); + + // 16-pixel partial store: `write_rgb_32` / `write_rgba_32` emit + // 32-pixel output (96 / 128 bytes) — too wide for our 16-pixel + // iter. Use the v210-style stack-buffer + scalar interleave + // pattern. (16 px × 3 = 48 bytes RGB, 16 px × 4 = 64 bytes RGBA.) 
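The stack-buffer interleave exists because the shared write helpers only come in full-width (32-pixel) form; pushing a 16-pixel result through them would store past the bytes this iteration owns. Staging the three channels in arrays and interleaving with a short scalar loop writes exactly the valid span:

// 16-pixel partial RGB store via per-channel staging buffers.
fn store_rgb_16(r: &[u8; 32], g: &[u8; 32], b: &[u8; 32], dst: &mut [u8]) {
    for i in 0..16 {
        dst[i * 3] = r[i];
        dst[i * 3 + 1] = g[i];
        dst[i * 3 + 2] = b[i]; // only the low 16 lanes are valid
    }
}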
+ let mut r_tmp = [0u8; 32]; + let mut g_tmp = [0u8; 32]; + let mut b_tmp = [0u8; 32]; + _mm256_storeu_si256(r_tmp.as_mut_ptr().cast(), r_u8); + _mm256_storeu_si256(g_tmp.as_mut_ptr().cast(), g_u8); + _mm256_storeu_si256(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 16 * 4]; + for i in 0..16 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 16 * 3]; + for i in 0..16 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 16; + x += 16; + } } // Scalar tail — remaining < 16 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -334,7 +341,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -360,112 +371,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); - let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm256_add_epi32( - _mm256_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2 - // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`. - let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); - let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); - let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); - - // 16-pixel u16 store: split each i16x16 channel into two - // 128-bit halves and use the SSE4.1 u16 interleave helpers - // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as - // the AVX2 high-bit YUV planar u16 path. 
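The native-depth bias is the half-scale point of the active depth, `1 << (BITS - 1)`: 512 at 10 bits, 2048 at 12. That is why the subtraction stays safely inside i16, since samples never exceed 4095. Scalar centering:

// Native-depth chroma centering; fits i16 for BITS <= 12.
fn center_chroma_n<const BITS: u32>(c: u16) -> i16 {
    debug_assert!((c as u32) < (1 << BITS));
    c as i16 - (1 << (BITS - 1)) as i16
}

fn main() {
    assert_eq!(center_chroma_n::<10>(512), 0);
    assert_eq!(center_chroma_n::<12>(2048), 0);
    assert_eq!(center_chroma_n::<12>(4095), 2047);
}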
- if ALPHA { - let alpha_u16 = _mm_set1_epi16(out_max); - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_u16_8( - _mm256_castsi256_si128(r), - _mm256_castsi256_si128(g), - _mm256_castsi256_si128(b), - alpha_u16, - dst, - ); - write_rgba_u16_8( - _mm256_extracti128_si256::<1>(r), - _mm256_extracti128_si256::<1>(g), - _mm256_extracti128_si256::<1>(b), - alpha_u16, - dst.add(32), - ); - } else { - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_u16_8( - _mm256_castsi256_si128(r), - _mm256_castsi256_si128(g), - _mm256_castsi256_si128(b), - dst, - ); - write_rgb_u16_8( - _mm256_extracti128_si256::<1>(r), - _mm256_extracti128_si256::<1>(g), - _mm256_extracti128_si256::<1>(b), - dst.add(24), - ); - } + if !BE { + let rnd_v = _mm256_set1_epi32(RND); + let y_off_v = _mm256_set1_epi16(y_off as i16); + let y_scale_v = _mm256_set1_epi32(y_scale); + let c_scale_v = _mm256_set1_epi32(c_scale); + let bias_v = _mm256_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm256_set1_epi16(out_max); + let zero_v = _mm256_set1_epi16(0); + let cru = _mm256_set1_epi32(coeffs.r_u()); + let crv = _mm256_set1_epi32(coeffs.r_v()); + let cgu = _mm256_set1_epi32(coeffs.g_u()); + let cgv = _mm256_set1_epi32(coeffs.g_v()); + let cbu = _mm256_set1_epi32(coeffs.b_u()); + let cbv = _mm256_set1_epi32(coeffs.b_v()); + + while x + 16 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_16px_avx2(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm256_sub_epi16(u_vec, bias_v); + let v_i16 = _mm256_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(u_i16)); + let u_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(u_i16)); + let v_lo_i32 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_i16)); + let v_hi_i32 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm256_add_epi32( + _mm256_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x16(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x16(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x16(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. The AVX2 + // `clamp_u16_max_x16` mirrors SSE4.1's `clamp_u16_max`. + let r = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); + let g = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); + let b = clamp_u16_max_x16(_mm256_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); + + // 16-pixel u16 store: split each i16x16 channel into two + // 128-bit halves and use the SSE4.1 u16 interleave helpers + // (`write_rgb_u16_8` / `write_rgba_u16_8`) — same pattern as + // the AVX2 high-bit YUV planar u16 path. 
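Per lane, the `clamp_u16_max_x16` step above amounts to this scalar model:

```rust
/// Native-depth output is clamped to [0, 2^BITS - 1] instead of being
/// narrowed to u8. Scalar equivalent of the vector clamp, for reference.
fn clamp_native<const BITS: u32>(v: i32) -> u16 {
    v.clamp(0, (1 << BITS) - 1) as u16
}

fn main() {
    assert_eq!(clamp_native::<10>(1200), 1023); // overshoot saturates to 10-bit max
    assert_eq!(clamp_native::<12>(-3), 0);      // undershoot saturates to 0
    assert_eq!(clamp_native::<12>(4095), 4095); // in-range values pass through
}
```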
+            if ALPHA {
+                let alpha_u16 = _mm_set1_epi16(out_max);
+                let dst = out.as_mut_ptr().add(x * 4);
+                write_rgba_u16_8(
+                    _mm256_castsi256_si128(r),
+                    _mm256_castsi256_si128(g),
+                    _mm256_castsi256_si128(b),
+                    alpha_u16,
+                    dst,
+                );
+                write_rgba_u16_8(
+                    _mm256_extracti128_si256::<1>(r),
+                    _mm256_extracti128_si256::<1>(g),
+                    _mm256_extracti128_si256::<1>(b),
+                    alpha_u16,
+                    dst.add(32),
+                );
+            } else {
+                let dst = out.as_mut_ptr().add(x * 3);
+                write_rgb_u16_8(
+                    _mm256_castsi256_si128(r),
+                    _mm256_castsi256_si128(g),
+                    _mm256_castsi256_si128(b),
+                    dst,
+                );
+                write_rgb_u16_8(
+                    _mm256_extracti128_si256::<1>(r),
+                    _mm256_extracti128_si256::<1>(g),
+                    _mm256_extracti128_si256::<1>(b),
+                    dst.add(24),
+                );
+            }
 
-        x += 16;
+            x += 16;
+        }
     }
 
     if x < width {
         let tail_packed = &packed[x * 2..width * 2];
         let tail_out = &mut out[x * bpp..width * bpp];
         let tail_w = width - x;
-        scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA>(
+        scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<BITS, ALPHA, BE>(
             tail_packed,
             tail_out,
             tail_w,
@@ -491,7 +504,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>(
     packed: &[u16],
     luma_out: &mut [u8],
     width: usize,
@@ -508,50 +521,52 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
     // SAFETY: caller's obligation per the safety contract above.
     unsafe {
-        // Per-lane Y permute mask: pick even u16 lanes (low byte at [0],
-        // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed.
-        let split_idx = _mm256_setr_epi8(
-            0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
-            0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
-        );
-        let mut x = 0usize;
-        while x + 16 <= width {
-            let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-            let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-            let v0s = _mm256_shuffle_epi8(v0, split_idx);
-            let v1s = _mm256_shuffle_epi8(v1, split_idx);
-            // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y
-            // values in its two lanes' low 64 bits. Pack lane0_low and
-            // lane1_low into the low 128 bits of each vector via
-            // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]).
-            let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-            let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-            // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes).
-            // Low 128 of v1p = [Y8..Y15].
-            // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low).
-            let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
-
-            // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
-            // any BITS ∈ {10, 12} — same single-shift simplification used
-            // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a
-            // literal const count, so it works without runtime-count helper.
-            let y_shr = _mm256_srli_epi16::<8>(y_vec);
-            // Pack 16 i16 lanes to u8 — only low 16 bytes used.
-            let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256());
-            // Store low 16 bytes via stack buffer + copy_from_slice.
-            let mut tmp = [0u8; 32];
-            _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8);
-            luma_out[x..x + 16].copy_from_slice(&tmp[..16]);
-
-            x += 16;
+        let mut x = 0usize;
+        if !BE {
+            // Per-lane Y permute mask: pick even u16 lanes (low byte at [0],
+            // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed.
+            let split_idx = _mm256_setr_epi8(
+                0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
+                0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
+            );
+
+            while x + 16 <= width {
+                let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
+                let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
+                let v0s = _mm256_shuffle_epi8(v0, split_idx);
+                let v1s = _mm256_shuffle_epi8(v1, split_idx);
+                // After per-lane shuffle: each 256-bit vector has 8 valid u16 Y
+                // values in its two lanes' low 64 bits. Pack lane0_low and
+                // lane1_low into the low 128 bits of each vector via
+                // `_mm256_permute4x64_epi64::<0x88>` (= [0, 2, 0, 2]).
+                let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
+                let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
+                // Low 128 of v0p = [Y0..Y7] (8 u16 = 16 bytes).
+                // Low 128 of v1p = [Y8..Y15].
+                // Combine via `_mm256_permute2x128_si256::<0x20>` (low | low).
+                let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
+
+                // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for
+                // any BITS ∈ {10, 12} — same single-shift simplification used
+                // by NEON's `vshrn_n_u16::<8>`. `_mm256_srli_epi16::<8>` has a
+                // literal const count, so it works without runtime-count helper.
+                let y_shr = _mm256_srli_epi16::<8>(y_vec);
+                // Pack 16 i16 lanes to u8 — only low 16 bytes used.
+                let y_u8 = narrow_u8x32(y_shr, _mm256_setzero_si256());
+                // Store low 16 bytes via stack buffer + copy_from_slice.
+                let mut tmp = [0u8; 32];
+                _mm256_storeu_si256(tmp.as_mut_ptr().cast(), y_u8);
+                luma_out[x..x + 16].copy_from_slice(&tmp[..16]);
+
+                x += 16;
+            }
         }
 
     if x < width {
         let tail_packed = &packed[x * 2..width * 2];
         let tail_out = &mut luma_out[x..width];
         let tail_w = width - x;
-        scalar::y2xx_n_to_luma_row::<BITS>(tail_packed, tail_out, tail_w);
+        scalar::y2xx_n_to_luma_row::<BITS, BE>(tail_packed, tail_out, tail_w);
     }
 }
 }
@@ -569,7 +584,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row<const BITS: u32>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx2")]
-pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
+pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>(
     packed: &[u16],
     luma_out: &mut [u16],
     width: usize,
@@ -586,33 +601,35 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row<const BITS: u32>(
     // SAFETY: caller's obligation per the safety contract above.
     unsafe {
-        let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32);
-        let split_idx = _mm256_setr_epi8(
-            0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane
-            0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane
-        );
-        let mut x = 0usize;
-        while x + 16 <= width {
-            let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast());
-            let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast());
-            let v0s = _mm256_shuffle_epi8(v0, split_idx);
-            let v1s = _mm256_shuffle_epi8(v1, split_idx);
-            let v0p = _mm256_permute4x64_epi64::<0x88>(v0s);
-            let v1p = _mm256_permute4x64_epi64::<0x88>(v1s);
-            let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p);
-            // Right-shift by `(16 - BITS)` to bring MSB-aligned samples
-            // into low-bit-packed form for the native-depth u16 output.
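The shift-collapse identity that the comment above relies on can be checked exhaustively:

```rust
/// Verifies (y >> (16 - BITS)) >> (BITS - 8) == y >> 8 for every u16 input
/// and both supported bit depths, i.e. the single-shift simplification.
fn main() {
    for bits in [10u32, 12] {
        for y in 0..=u16::MAX {
            let two_step = (y >> (16 - bits)) >> (bits - 8);
            assert_eq!(two_step, y >> 8);
        }
    }
    println!("single-shift simplification holds for BITS in {{10, 12}}");
}
```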
- let y_low = _mm256_srl_epi16(y_vec, shr_count); - _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 16; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let split_idx = _mm256_setr_epi8( + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane + 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // high lane + ); + + while x + 16 <= width { + let v0 = _mm256_loadu_si256(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm256_loadu_si256(packed.as_ptr().add(x * 2 + 16).cast()); + let v0s = _mm256_shuffle_epi8(v0, split_idx); + let v1s = _mm256_shuffle_epi8(v1, split_idx); + let v0p = _mm256_permute4x64_epi64::<0x88>(v0s); + let v1p = _mm256_permute4x64_epi64::<0x88>(v1s); + let y_vec = _mm256_permute2x128_si256::<0x20>(v0p, v1p); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = _mm256_srl_epi16(y_vec, shr_count); + _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx512/tests/v210.rs b/src/row/arch/x86_avx512/tests/v210.rs index 0abf4bae..f2652cb8 100644 --- a/src/row/arch/x86_avx512/tests/v210.rs +++ b/src/row/arch/x86_avx512/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + 
scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v210→luma u16 diverges (width={width})"); } @@ -250,7 +250,7 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx512 v210 luma reorder bug"); @@ -259,9 +259,15 @@ fn avx512_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx512/tests/y216.rs b/src/row/arch/x86_avx512/tests/y216.rs index 93fa76cc..ae8b2bc7 100644 --- a/src/row/arch/x86_avx512/tests/y216.rs +++ b/src/row/arch/x86_avx512/tests/y216.rs @@ -16,9 +16,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -33,9 +33,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -49,9 +49,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - 
y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y216→luma u8 diverges (width={width})"); } @@ -60,9 +60,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y216→luma u16 diverges (width={width})"); } @@ -178,7 +178,7 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "AVX-512 y216 luma_u16 reorder bug"); @@ -187,9 +187,15 @@ fn avx512_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_avx512/tests/y2xx.rs b/src/row/arch/x86_avx512/tests/y2xx.rs index fd5ccbad..dc609f8d 100644 --- a/src/row/arch/x86_avx512/tests/y2xx.rs +++ b/src/row/arch/x86_avx512/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). 
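The "low-bit-packed" convention these tests assert, shown concretely for a 10-bit sample (illustrative values):

```rust
/// MSB-aligned vs. low-bit-packed storage of the same 10-bit sample.
/// The kernels shift right by (16 - BITS) to produce the latter.
fn main() {
    let native: u16 = 0x3FF;                 // low-bit-packed 10-bit max
    let msb_aligned = native << (16 - 10);   // as carried in y210 words
    assert_eq!(msb_aligned, 0xFFC0);
    assert_eq!(msb_aligned >> (16 - 10), native); // the kernels' unshift
}
```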
let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -111,9 +111,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -125,9 +125,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -139,9 +139,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -153,9 +155,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -167,9 +171,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -178,9 +182,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut 
s, width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -278,15 +282,15 @@ fn avx512_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "AVX-512 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -294,7 +298,7 @@ fn avx512_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -309,17 +313,17 @@ fn avx512_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "AVX-512 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "AVX-512 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_avx512/v210.rs b/src/row/arch/x86_avx512/v210.rs index e5a77eb4..8c68f16d 100644 --- a/src/row/arch/x86_avx512/v210.rs +++ b/src/row/arch/x86_avx512/v210.rs @@ -40,7 +40,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x16, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables -------------------------------------- @@ -187,11 +187,11 @@ static V_FROM_MID: [i16; 32] = [ /// `permutexvar` op `vpermw`). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 64 bytes readable; AVX-512F // + AVX-512BW are available. unsafe { - let words = _mm512_loadu_si512(ptr.cast()); + let words = load_endian_u32x16::(ptr); let mask10 = _mm512_set1_epi32(0x3FF); let low10 = _mm512_and_si512(words, mask10); let mid10 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask10); @@ -247,7 +247,7 @@ unsafe fn unpack_v210_4words_avx512(ptr: *const u8) -> (__m512i, __m512i, __m512 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
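Of the four formats, v210 appears to be the only one whose AVX-512 path stays vectorized for big-endian input: the `load_endian_u32x16::<BE>` call in the unpack above swaps bytes at load time instead of deferring the whole row to the scalar tail. A one-word scalar sketch of what such an endian-parameterized load does (assumed shape; the real helper handles sixteen words at once):

```rust
/// Endian-parameterized 32-bit load, one word at a time.
fn load_endian_u32<const BE: bool>(bytes: [u8; 4]) -> u32 {
    if BE {
        u32::from_be_bytes(bytes)
    } else {
        u32::from_le_bytes(bytes)
    }
}

fn main() {
    let raw = [0x01, 0x02, 0x03, 0x04];
    assert_eq!(load_endian_u32::<false>(raw), 0x0403_0201);
    assert_eq!(load_endian_u32::<true>(raw), 0x0102_0304);
}
```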
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
     packed: &[u8],
     out: &mut [u8],
     width: usize,
@@ -290,7 +290,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
         // Main loop: 24 pixels (4 v210 words = 64 bytes) per iteration.
         let quads = words / 4;
         for q in 0..quads {
-            let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+            let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
 
             let y_i16 = y_vec;
@@ -392,7 +392,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
             let tail_packed = &packed[quads * 64..total_words * 16];
             let tail_out = &mut out[tail_start_px * bpp..width * bpp];
             let tail_w = width - tail_start_px;
-            scalar::v210_to_rgb_or_rgba_row::<ALPHA>(tail_packed, tail_out, tail_w, matrix, full_range);
+            scalar::v210_to_rgb_or_rgba_row::<ALPHA, BE>(
+                tail_packed,
+                tail_out,
+                tail_w,
+                matrix,
+                full_range,
+            );
         }
     }
 }
@@ -409,7 +415,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row<const ALPHA: bool>(
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements).
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
+pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>(
     packed: &[u8],
     out: &mut [u16],
     width: usize,
@@ -451,7 +457,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 
         let quads = words / 4;
         for q in 0..quads {
-            let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+            let (y_vec, u_vec, v_vec) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
 
             let y_i16 = y_vec;
             let u_i16 = _mm512_sub_epi16(u_vec, bias_v);
@@ -529,7 +535,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
         let tail_packed = &packed[quads * 64..total_words * 16];
         let tail_out = &mut out[tail_start_px * bpp..width * bpp];
         let tail_w = width - tail_start_px;
-        scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA>(
+        scalar::v210_to_rgb_u16_or_rgba_u16_row::<ALPHA, BE>(
             tail_packed,
             tail_out,
             tail_w,
@@ -552,7 +558,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>(
 /// 4. `luma_out.len() >= width`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
+pub(crate) unsafe fn v210_to_luma_row<const BE: bool>(
+    packed: &[u8],
+    luma_out: &mut [u8],
+    width: usize,
+) {
     debug_assert!(width.is_multiple_of(2), "v210 requires even width");
     let total_words = width.div_ceil(6);
     let words = width / 6;
@@ -566,7 +576,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
     unsafe {
         let quads = words / 4;
         for q in 0..quads {
-            let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+            let (y_vec, _, _) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
             // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x64 via packus
             // (only first 32 lanes carry data, paired with a zero hi half;
             // first 24 bytes of the result are valid Y0..Y23).
@@ -585,7 +595,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
         let tail_packed = &packed[quads * 64..total_words * 16];
         let tail_out = &mut luma_out[tail_start_px..width];
         let tail_w = width - tail_start_px;
-        scalar::v210_to_luma_row(tail_packed, tail_out, tail_w);
+        scalar::v210_to_luma_row::<BE>(tail_packed, tail_out, tail_w);
     }
 }
 }
@@ -602,7 +612,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) {
 /// 4. `luma_out.len() >= width`.
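The mask-and-shift unpack that `unpack_v210_4words_avx512` vectorizes, written out for one group in scalar Rust (illustrative; assumes the standard v210 component placement of three 10-bit samples per 32-bit word):

```rust
/// Scalar decode of one v210 group: four u32 words carry six pixels, with
/// components at bits 0-9, 10-19, and 20-29 of each word.
fn decode_v210_group(w: [u32; 4]) -> ([u16; 6], [u16; 3], [u16; 3]) {
    let c = |word: u32, slot: u32| ((word >> (10 * slot)) & 0x3FF) as u16;
    let y = [c(w[0], 1), c(w[1], 0), c(w[1], 2), c(w[2], 1), c(w[3], 0), c(w[3], 2)];
    let u = [c(w[0], 0), c(w[1], 1), c(w[2], 2)];
    let v = [c(w[0], 2), c(w[2], 0), c(w[3], 1)];
    (y, u, v)
}

fn main() {
    let w0 = 1 | (2 << 10) | (3 << 20); // U0 = 1, Y0 = 2, V0 = 3
    let (y, u, v) = decode_v210_group([w0, 0, 0, 0]);
    assert_eq!((y[0], u[0], v[0]), (2, 1, 3));
}
```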
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
+pub(crate) unsafe fn v210_to_luma_u16_row<const BE: bool>(
+    packed: &[u8],
+    luma_out: &mut [u16],
+    width: usize,
+) {
     debug_assert!(width.is_multiple_of(2), "v210 requires even width");
     let total_words = width.div_ceil(6);
     let words = width / 6;
@@ -613,7 +627,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
     unsafe {
         let quads = words / 4;
         for q in 0..quads {
-            let (y_vec, _, _) = unpack_v210_4words_avx512(packed.as_ptr().add(q * 64));
+            let (y_vec, _, _) = unpack_v210_4words_avx512::<BE>(packed.as_ptr().add(q * 64));
             // Store first 24 of the 32 u16 lanes via stack buffer + copy_from_slice.
             let mut tmp = [0u16; 32];
             _mm512_storeu_si512(tmp.as_mut_ptr().cast(), y_vec);
@@ -627,7 +641,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) {
         let tail_packed = &packed[quads * 64..total_words * 16];
         let tail_out = &mut luma_out[tail_start_px..width];
         let tail_w = width - tail_start_px;
-        scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w);
+        scalar::v210_to_luma_u16_row::<BE>(tail_packed, tail_out, tail_w);
     }
 }
 }
diff --git a/src/row/arch/x86_avx512/y216.rs b/src/row/arch/x86_avx512/y216.rs
index 2a60b3b3..be564433 100644
--- a/src/row/arch/x86_avx512/y216.rs
+++ b/src/row/arch/x86_avx512/y216.rs
@@ -118,7 +118,7 @@ unsafe fn unpack_y216_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i
 /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`.
 #[inline]
 #[target_feature(enable = "avx512f,avx512bw")]
-pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
+pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>(
     packed: &[u16],
     out: &mut [u8],
     width: usize,
@@ -137,150 +137,160 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row<const ALPHA: bool>(
 
     // SAFETY: AVX-512F + AVX-512BW is the caller's obligation.
     unsafe {
-        let rnd_v = _mm512_set1_epi32(RND);
-        let y_off_v = _mm512_set1_epi32(y_off);
-        let y_scale_v = _mm512_set1_epi32(y_scale);
-        let c_scale_v = _mm512_set1_epi32(c_scale);
-        // Chroma bias: 32768 via wrapping -32768 i16.
-        let bias16_v = _mm512_set1_epi16(-32768i16);
-        let cru = _mm512_set1_epi32(coeffs.r_u());
-        let crv = _mm512_set1_epi32(coeffs.r_v());
-        let cgu = _mm512_set1_epi32(coeffs.g_u());
-        let cgv = _mm512_set1_epi32(coeffs.g_v());
-        let cbu = _mm512_set1_epi32(coeffs.b_u());
-        let cbv = _mm512_set1_epi32(coeffs.b_v());
-        let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
-        let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11);
-        let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15);
-        let mut x = 0usize;
-        while x + 64 <= width {
-            // --- lo group: pixels x..x+31 (32 pixels) --------------------------
-            let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2));
-
-            let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v);
-            let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v);
-
-            // Widen 16 valid U/V i16 lanes to two i32x16 halves.
- let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16)); - let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16)); - let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16)); - let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16)); - - let u_d_lo_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_a, c_scale_v), - rnd_v, - )); - let u_d_lo_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_b, c_scale_v), - rnd_v, - )); - let v_d_lo_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_a, c_scale_v), - rnd_v, - )); - let v_d_lo_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_b, c_scale_v), - rnd_v, - )); - - // chroma_i16x32: 32-lane vector, valid data in lanes 0..16. - let r_chroma_lo = chroma_i16x32( - cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - let g_chroma_lo = chroma_i16x32( - cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - let b_chroma_lo = chroma_i16x32( - cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, - ); - - // Duplicate each chroma sample into its 4:2:2 Y-pair slot. - // 16 valid chroma → lo32 covers all 32 Y lanes. - let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx); - - // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767. - let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // --- hi group: pixels x+32..x+63 (32 pixels) ---------------------- - let (y_hi_vec, u_hi_vec, v_hi_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64)); - - let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); - let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v); - - let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16)); - let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16)); - let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16)); - let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16)); - - let u_d_hi_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_a, c_scale_v), - rnd_v, - )); - let u_d_hi_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_b, c_scale_v), - rnd_v, - )); - let v_d_hi_a = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_a, c_scale_v), - rnd_v, - )); - let v_d_hi_b = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_b, c_scale_v), - rnd_v, - )); - - let r_chroma_hi = chroma_i16x32( - cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - let g_chroma_hi = chroma_i16x32( - cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - let b_chroma_hi = chroma_i16x32( - cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, - ); - - let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx); - let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx); - let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx); - - let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Saturating i16 add + narrow to u8x64 per channel. 
- let r_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, r_dup_lo), - _mm512_adds_epi16(y_hi_scaled, r_dup_hi), - pack_fixup, - ); - let g_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, g_dup_lo), - _mm512_adds_epi16(y_hi_scaled, g_dup_hi), - pack_fixup, - ); - let b_u8 = narrow_u8x64( - _mm512_adds_epi16(y_lo_scaled, b_dup_lo), - _mm512_adds_epi16(y_hi_scaled, b_dup_hi), - pack_fixup, - ); - - if ALPHA { - let alpha = _mm512_set1_epi8(-1); - write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi32(y_off); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + // Chroma bias: 32768 via wrapping -32768 i16. + let bias16_v = _mm512_set1_epi16(-32768i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 64 <= width { + // --- lo group: pixels x..x+31 (32 pixels) ------------------------ + let (y_lo_vec, u_lo_vec, v_lo_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); + + let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v); + let v_lo_i16 = _mm512_sub_epi16(v_lo_vec, bias16_v); + + // Widen 16 valid U/V i16 lanes to two i32x16 halves. + let u_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_lo_i16)); + let u_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_lo_i16)); + let v_lo_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_lo_i16)); + let v_lo_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_lo_i16)); + + let u_d_lo_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_a, c_scale_v), + rnd_v, + )); + let u_d_lo_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_b, c_scale_v), + rnd_v, + )); + let v_d_lo_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_a, c_scale_v), + rnd_v, + )); + let v_d_lo_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_b, c_scale_v), + rnd_v, + )); + + // chroma_i16x32: 32-lane vector, valid data in lanes 0..16. + let r_chroma_lo = chroma_i16x32( + cru, crv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + let g_chroma_lo = chroma_i16x32( + cgu, cgv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + let b_chroma_lo = chroma_i16x32( + cbu, cbv, u_d_lo_a, v_d_lo_a, u_d_lo_b, v_d_lo_b, rnd_v, pack_fixup, + ); + + // Duplicate each chroma sample into its 4:2:2 Y-pair slot. + // 16 valid chroma → lo32 covers all 32 Y lanes. + let (r_dup_lo, _) = chroma_dup(r_chroma_lo, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _) = chroma_dup(g_chroma_lo, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _) = chroma_dup(b_chroma_lo, dup_lo_idx, dup_hi_idx); + + // scale_y_u16_avx512: unsigned-widens Y to avoid i16 overflow for Y > 32767. 
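The overflow that the `scale_y_u16_avx512` comment warns about is easy to reproduce, and is why the 16-bit Y path widens with `cvtepu16` rather than treating the lanes as signed:

```rust
/// A 16-bit Y sample above 32767 is sign-corrupted by an i16 view, so Y
/// must be widened unsigned before the offset/scale arithmetic.
fn main() {
    let y: u16 = 60_000;
    let signed_widen = i32::from(y as i16); // -5_536 (wrong)
    let unsigned_widen = i32::from(y);      //  60_000 (what the kernel uses)
    assert_ne!(signed_widen, unsigned_widen);
    assert_eq!(unsigned_widen, 60_000);
}
```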
+ let y_lo_scaled = scale_y_u16_avx512(y_lo_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // --- hi group: pixels x+32..x+63 (32 pixels) ---------------------- + let (y_hi_vec, u_hi_vec, v_hi_vec) = + unpack_y216_32px_avx512(packed.as_ptr().add(x * 2 + 64)); + + let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); + let v_hi_i16 = _mm512_sub_epi16(v_hi_vec, bias16_v); + + let u_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_hi_i16)); + let u_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_hi_i16)); + let v_hi_a = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_hi_i16)); + let v_hi_b = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_hi_i16)); + + let u_d_hi_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_a, c_scale_v), + rnd_v, + )); + let u_d_hi_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_b, c_scale_v), + rnd_v, + )); + let v_d_hi_a = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_a, c_scale_v), + rnd_v, + )); + let v_d_hi_b = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_b, c_scale_v), + rnd_v, + )); + + let r_chroma_hi = chroma_i16x32( + cru, crv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + let g_chroma_hi = chroma_i16x32( + cgu, cgv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + let b_chroma_hi = chroma_i16x32( + cbu, cbv, u_d_hi_a, v_d_hi_a, u_d_hi_b, v_d_hi_b, rnd_v, pack_fixup, + ); + + let (r_dup_hi, _) = chroma_dup(r_chroma_hi, dup_lo_idx, dup_hi_idx); + let (g_dup_hi, _) = chroma_dup(g_chroma_hi, dup_lo_idx, dup_hi_idx); + let (b_dup_hi, _) = chroma_dup(b_chroma_hi, dup_lo_idx, dup_hi_idx); + + let y_hi_scaled = scale_y_u16_avx512(y_hi_vec, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Saturating i16 add + narrow to u8x64 per channel. + let r_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, r_dup_lo), + _mm512_adds_epi16(y_hi_scaled, r_dup_hi), + pack_fixup, + ); + let g_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, g_dup_lo), + _mm512_adds_epi16(y_hi_scaled, g_dup_hi), + pack_fixup, + ); + let b_u8 = narrow_u8x64( + _mm512_adds_epi16(y_lo_scaled, b_dup_lo), + _mm512_adds_epi16(y_hi_scaled, b_dup_hi), + pack_fixup, + ); + + if ALPHA { + let alpha = _mm512_set1_epi8(-1); + write_rgba_64(r_u8, g_u8, b_u8, alpha, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_64(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + + x += 64; } - - x += 64; } // Scalar tail — remaining < 64 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -301,7 +311,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -320,125 +330,130 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. 
unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_i64_v = _mm512_set1_epi64(RND_I64); - let rnd_i32_v = _mm512_set1_epi32(RND_I32); - let y_off_v = _mm512_set1_epi32(y_off); - let y_scale_v = _mm512_set1_epi32(y_scale); - let c_scale_v = _mm512_set1_epi32(c_scale); - let bias16_v = _mm512_set1_epi16(-32768i16); - let cru = _mm512_set1_epi32(coeffs.r_u()); - let crv = _mm512_set1_epi32(coeffs.r_v()); - let cgu = _mm512_set1_epi32(coeffs.g_u()); - let cgv = _mm512_set1_epi32(coeffs.g_v()); - let cbu = _mm512_set1_epi32(coeffs.b_u()); - let cbv = _mm512_set1_epi32(coeffs.b_v()); - - // Permute indices built once. - // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots. - let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); - let dup_hi_idx = _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15); - // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...]. - let interleave_idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let mut x = 0usize; - while x + 32 <= width { - // One deinterleave gives 32 Y + 16 UV pairs. - let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); - - // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16). - let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); - let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); - - // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale. - // High 256 bits of u_vec / v_vec hold don't-care values after the - // U/V split permute; they won't reach chroma_i64x8_avx512. - let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); - let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - - // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 → - // product ≤ ~1.26·10⁹ — fits i32. - let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( - _mm512_mullo_epi32(u_i32, c_scale_v), - rnd_i32_v, - )); - let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( - _mm512_mullo_epi32(v_i32, c_scale_v), - rnd_i32_v, - )); - - // i64 chroma: even and odd i32 lanes separately. - let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d); - let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v); - let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v); - let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v); - let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v); - let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v); - let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v); - - // Reassemble i64x8 pairs → i32x16 [c0..c15]. - let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx); - let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx); - let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx); - - // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling). - let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32); - let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32); - let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32); - let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32); - let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32); - let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32); - - // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64. 
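The wrapping-bias subtraction noted in the comments above ("wrapping i16 sub of -32768 = +32768 mod 2^16") checks out over the full u16 range; a quick scalar verification:

```rust
/// With raw u16 chroma reinterpreted as i16, subtracting -32768 with
/// wrapping semantics yields exactly `raw - 32768`, i.e. zero-centered chroma.
fn main() {
    for raw in [0u16, 1, 32_767, 32_768, 65_535] {
        let centered = (raw as i16).wrapping_sub(-32768);
        assert_eq!(i32::from(centered), i32::from(raw) - 32_768);
    }
}
```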
- let y_lo_u16 = _mm512_castsi512_si256(y_vec); - let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec); - let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v); - let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v); - - let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx); - let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx); - - // Y + chroma → pack with unsigned saturation to u16x32. - let r_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, r_dup_lo), - _mm512_add_epi32(y_hi_scaled, r_dup_hi), - ), - ); - let g_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, g_dup_lo), - _mm512_add_epi32(y_hi_scaled, g_dup_hi), - ), - ); - let b_u16 = _mm512_permutexvar_epi64( - pack_fixup, - _mm512_packus_epi32( - _mm512_add_epi32(y_lo_scaled, b_dup_lo), - _mm512_add_epi32(y_hi_scaled, b_dup_hi), - ), - ); - - if ALPHA { - write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_i64_v = _mm512_set1_epi64(RND_I64); + let rnd_i32_v = _mm512_set1_epi32(RND_I32); + let y_off_v = _mm512_set1_epi32(y_off); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias16_v = _mm512_set1_epi16(-32768i16); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + // Permute indices built once. + // dup_{lo,hi}_idx: duplicate 16 chroma i32 lanes into 32 slots. + let dup_lo_idx = _mm512_setr_epi32(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7); + let dup_hi_idx = + _mm512_setr_epi32(8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15); + // interleave_idx: even i32x8 + odd i32x8 → i32x16 [e0,o0,e1,o1,...]. + let interleave_idx = + _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + + while x + 32 <= width { + // One deinterleave gives 32 Y + 16 UV pairs. + let (y_vec, u_vec, v_vec) = unpack_y216_32px_avx512(packed.as_ptr().add(x * 2)); + + // Subtract chroma bias (wrapping i16 sub of -32768 = +32768 mod 2^16). + let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); + + // Widen 16 valid i16 lanes (low 256 bits) to i32x16 for Q15 scale. + // High 256 bits of u_vec / v_vec hold don't-care values after the + // U/V split permute; they won't reach chroma_i64x8_avx512. + let u_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let v_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + + // Scale UV in i32: |u_centered| ≤ 32768, |c_scale| ≤ ~38300 → + // product ≤ ~1.26·10⁹ — fits i32. + let u_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( + _mm512_mullo_epi32(u_i32, c_scale_v), + rnd_i32_v, + )); + let v_d = _mm512_srai_epi32::<15>(_mm512_add_epi32( + _mm512_mullo_epi32(v_i32, c_scale_v), + rnd_i32_v, + )); + + // i64 chroma: even and odd i32 lanes separately. 
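The i32 headroom bound quoted a few lines up can be verified directly (assuming the usual Q15 rounding term of 1 << 14):

```rust
/// Worst-case Q15 chroma product plus rounding stays inside i32, so no
/// i64 widening is needed at the UV-scale step.
fn main() {
    let worst: i64 = 32_768 * 38_300 + (1 << 14);
    assert!(worst <= i32::MAX as i64);
    println!("worst case = {worst}");
}
```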
+ let u_d_odd = _mm512_shuffle_epi32::<0xF5>(u_d); + let v_d_odd = _mm512_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x8_avx512(cru, crv, u_d, v_d, rnd_i64_v); + let r_ch_odd = chroma_i64x8_avx512(cru, crv, u_d_odd, v_d_odd, rnd_i64_v); + let g_ch_even = chroma_i64x8_avx512(cgu, cgv, u_d, v_d, rnd_i64_v); + let g_ch_odd = chroma_i64x8_avx512(cgu, cgv, u_d_odd, v_d_odd, rnd_i64_v); + let b_ch_even = chroma_i64x8_avx512(cbu, cbv, u_d, v_d, rnd_i64_v); + let b_ch_odd = chroma_i64x8_avx512(cbu, cbv, u_d_odd, v_d_odd, rnd_i64_v); + + // Reassemble i64x8 pairs → i32x16 [c0..c15]. + let r_ch_i32 = reassemble_i32x16(r_ch_even, r_ch_odd, interleave_idx); + let g_ch_i32 = reassemble_i32x16(g_ch_even, g_ch_odd, interleave_idx); + let b_ch_i32 = reassemble_i32x16(b_ch_even, b_ch_odd, interleave_idx); + + // Duplicate 16 chroma values → 32 slots (4:2:2 upsampling). + let r_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, r_ch_i32); + let r_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, r_ch_i32); + let g_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, g_ch_i32); + let g_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, g_ch_i32); + let b_dup_lo = _mm512_permutexvar_epi32(dup_lo_idx, b_ch_i32); + let b_dup_hi = _mm512_permutexvar_epi32(dup_hi_idx, b_ch_i32); + + // Y: unsigned-widen 32 u16 → two i32x16 halves, subtract y_off, scale i64. + let y_lo_u16 = _mm512_castsi512_si256(y_vec); + let y_hi_u16 = _mm512_extracti64x4_epi64::<1>(y_vec); + let y_lo_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_lo_u16), y_off_v); + let y_hi_i32 = _mm512_sub_epi32(_mm512_cvtepu16_epi32(y_hi_u16), y_off_v); + + let y_lo_scaled = scale_y_i32x16_i64(y_lo_i32, y_scale_v, rnd_i64_v, interleave_idx); + let y_hi_scaled = scale_y_i32x16_i64(y_hi_i32, y_scale_v, rnd_i64_v, interleave_idx); + + // Y + chroma → pack with unsigned saturation to u16x32. + let r_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, r_dup_lo), + _mm512_add_epi32(y_hi_scaled, r_dup_hi), + ), + ); + let g_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, g_dup_lo), + _mm512_add_epi32(y_hi_scaled, g_dup_hi), + ), + ); + let b_u16 = _mm512_permutexvar_epi64( + pack_fixup, + _mm512_packus_epi32( + _mm512_add_epi32(y_lo_scaled, b_dup_lo), + _mm512_add_epi32(y_hi_scaled, b_dup_hi), + ), + ); + + if ALPHA { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + + x += 32; } - - x += 32; } // Scalar tail — remaining < 32 pixels. + // When BE=true the full row is covered here. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -464,43 +479,49 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. 
unsafe { - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 64 <= width { - // lo group: pixels x..x+31 - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); - let y_lo_shr = _mm512_srli_epi16::<8>(y_lo); - - // hi group: pixels x+32..x+63 - let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); - let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); - let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); - let y_hi_shr = _mm512_srli_epi16::<8>(y_hi); - - // Pack 64 × i16 → 64 × u8 with natural order. - let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup); - // Store all 64 bytes at once. - _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 64; + if !BE { + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 64 <= width { + // lo group: pixels x..x+31 + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); + let y_lo_shr = _mm512_srli_epi16::<8>(y_lo); + + // hi group: pixels x+32..x+63 + let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); + let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); + let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); + let y_hi_shr = _mm512_srli_epi16::<8>(y_hi); + + // Pack 64 × i16 → 64 × u8 with natural order. + let y_u8 = narrow_u8x64(y_lo_shr, y_hi_shr, pack_fixup); + // Store all 64 bytes at once. + _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 64; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -520,39 +541,45 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 64 <= width { - // lo group: pixels x..x+31 - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); - - // hi group: pixels x+32..x+63 - let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); - let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); - let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); - - // Direct store — full 16-bit Y values, no shift. 
- _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo); - _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi); - - x += 64; + if !BE { + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 64 <= width { + // lo group: pixels x..x+31 + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_lo = _mm512_permutex2var_epi16(v0, y_idx, v1); + + // hi group: pixels x+32..x+63 + let v2 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 64).cast()); + let v3 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 96).cast()); + let y_hi = _mm512_permutex2var_epi16(v2, y_idx, v3); + + // Direct store — full 16-bit Y values, no shift. + _mm512_storeu_si512(out.as_mut_ptr().add(x).cast(), y_lo); + _mm512_storeu_si512(out.as_mut_ptr().add(x + 32).cast(), y_hi); + + x += 64; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_avx512/y2xx.rs b/src/row/arch/x86_avx512/y2xx.rs index 4944cc6d..1d2b1dcd 100644 --- a/src/row/arch/x86_avx512/y2xx.rs +++ b/src/row/arch/x86_avx512/y2xx.rs @@ -177,7 +177,11 @@ unsafe fn unpack_y2xx_32px_avx512( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -205,132 +209,135 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - // i16x32 chroma vectors with valid data in lanes 0..16. - let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - - // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup` - // duplicates each of 32 chroma lanes into its pair slot, - // splitting across two i16x32 vectors. With 16 valid chroma in - // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1, - // ..., c15,c15]); `hi32` is don't-care. - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Per-channel saturating add (i16x32). All 32 lanes valid. 
- let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo); - let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo); - let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo); - - // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)` - // packs 32 i16 lanes of `lo` to u8 in the result's first 32 - // bytes (next 32 zero, after the lane-fixup permute). - let zero = _mm512_setzero_si512(); - let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup); - let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup); - let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup); - - // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls - // (each writes 16 px = 48 / 64 bytes). `_mm512_extracti32x4_epi32` - // pulls the two valid 128-bit halves out of the u8x64 result. - if ALPHA { - let alpha = _mm_set1_epi8(-1); - let r0 = _mm512_castsi512_si128(r_u8); - let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); - let g0 = _mm512_castsi512_si128(g_u8); - let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); - let b0 = _mm512_castsi512_si128(b_u8); - let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); - let dst = out.as_mut_ptr().add(x * 4); - write_rgba_16(r0, g0, b0, alpha, dst); - write_rgba_16(r1, g1, b1, alpha, dst.add(64)); - } else { - let r0 = _mm512_castsi512_si128(r_u8); - let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); - let g0 = _mm512_castsi512_si128(g_u8); - let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); - let b0 = _mm512_castsi512_si128(b_u8); - let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); - let dst = out.as_mut_ptr().add(x * 3); - write_rgb_16(r0, g0, b0, dst); - write_rgb_16(r1, g1, b1, dst.add(48)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm512_srl_epi16` — see + // module-level note. + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 32 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. Only lanes 0..16 + // carry valid samples; the bias subtraction on don't-care lanes + // is harmless since they're discarded by `chroma_dup`'s `hi32`. + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + // Widen 16-valid-lane i16 chroma to two i32x16 halves so the + // Q15 multiplies don't overflow. Only lanes 0..16 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x32` to recycle the helper exactly; the + // don't-care output lanes 16..32 are discarded by `chroma_dup`'s + // `hi32` return below (which only consumes lanes 0..16 in its + // `lo32` return). 
+ let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + // i16x32 chroma vectors with valid data in lanes 0..16. + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + // Each chroma sample covers 2 Y lanes (4:2:2). `chroma_dup` + // duplicates each of 32 chroma lanes into its pair slot, + // splitting across two i16x32 vectors. With 16 valid chroma in + // lanes 0..16, `lo32` lanes 0..32 are valid (= [c0,c0, c1,c1, + // ..., c15,c15]); `hi32` is don't-care. + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x32. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Per-channel saturating add (i16x32). All 32 lanes valid. + let r_sum = _mm512_adds_epi16(y_scaled, r_dup_lo); + let g_sum = _mm512_adds_epi16(y_scaled, g_dup_lo); + let b_sum = _mm512_adds_epi16(y_scaled, b_dup_lo); + + // u8 narrow with saturation. `narrow_u8x64(lo, zero, pack_fixup)` + // packs 32 i16 lanes of `lo` to u8 in the result's first 32 + // bytes (next 32 zero, after the lane-fixup permute). + let zero = _mm512_setzero_si512(); + let r_u8 = narrow_u8x64(r_sum, zero, pack_fixup); + let g_u8 = narrow_u8x64(g_sum, zero, pack_fixup); + let b_u8 = narrow_u8x64(b_sum, zero, pack_fixup); + + // 32-pixel store via two `write_rgb_16` / `write_rgba_16` calls + // (each writes 16 px = 48 / 64 bytes). `_mm512_extracti32x4_epi32` + // pulls the two valid 128-bit halves out of the u8x64 result. + if ALPHA { + let alpha = _mm_set1_epi8(-1); + let r0 = _mm512_castsi512_si128(r_u8); + let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); + let g0 = _mm512_castsi512_si128(g_u8); + let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); + let b0 = _mm512_castsi512_si128(b_u8); + let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_16(r0, g0, b0, alpha, dst); + write_rgba_16(r1, g1, b1, alpha, dst.add(64)); + } else { + let r0 = _mm512_castsi512_si128(r_u8); + let r1 = _mm512_extracti32x4_epi32::<1>(r_u8); + let g0 = _mm512_castsi512_si128(g_u8); + let g1 = _mm512_extracti32x4_epi32::<1>(g_u8); + let b0 = _mm512_castsi512_si128(b_u8); + let b1 = _mm512_extracti32x4_epi32::<1>(b_u8); + let dst = out.as_mut_ptr().add(x * 3); + write_rgb_16(r0, g0, b0, dst); + write_rgb_16(r1, g1, b1, dst.add(48)); + } + + x += 32; } - - x += 32; } // Scalar tail — remaining < 32 pixels (always even per 4:2:2). + // When BE=true the full row is covered here. 
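The `if !BE { ... }` gate plus the unconditional tail is the whole fallback mechanism: when `BE` is true, `x` never advances, so the "tail" call hands the entire row to the byte-swapping scalar kernel, and the const generic lets the compiler drop the dead SIMD body from that instantiation. A minimal sketch of the shape, with a hypothetical copy kernel standing in for the real conversion:

    /// Hypothetical row kernel showing the BE gating pattern used above.
    fn row_kernel<const BE: bool>(src: &[u16], dst: &mut [u16]) {
        let mut x = 0usize;
        if !BE {
            // Native-endian fast path; the SIMD body would live here.
            while x + 8 <= dst.len() {
                dst[x..x + 8].copy_from_slice(&src[x..x + 8]);
                x += 8;
            }
        }
        // Tail: < 8 remaining pixels, or the whole row when BE = true.
        for i in x..dst.len() {
            dst[i] = if BE { src[i].swap_bytes() } else { src[i] };
        }
    }

    fn main() {
        let src: Vec<u16> = (0..11u16).map(|v| v << 8).collect();
        let (mut le, mut be) = (vec![0u16; 11], vec![0u16; 11]);
        row_kernel::<false>(&src, &mut le);
        row_kernel::<true>(&src, &mut be);
        assert_eq!(be[3], le[3].swap_bytes());
    }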
if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -357,7 +364,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -383,86 +394,88 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); - let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); - - let u_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_lo_i32, c_scale_v), - rnd_v, - )); - let u_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(u_hi_i32, c_scale_v), - rnd_v, - )); - let v_d_lo = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_lo_i32, c_scale_v), - rnd_v, - )); - let v_d_hi = q15_shift(_mm512_add_epi32( - _mm512_mullo_epi32(v_hi_i32, c_scale_v), - rnd_v, - )); - - let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); - - let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); - let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); - let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. - let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); - let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); - let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); - - // 32-pixel u16 store via the shared 32-pixel writers. 
- if ALPHA { - let alpha_u16 = _mm_set1_epi16(out_max); - write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm512_set1_epi32(RND); + let y_off_v = _mm512_set1_epi16(y_off as i16); + let y_scale_v = _mm512_set1_epi32(y_scale); + let c_scale_v = _mm512_set1_epi32(c_scale); + let bias_v = _mm512_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm512_set1_epi16(out_max); + let zero_v = _mm512_set1_epi16(0); + let cru = _mm512_set1_epi32(coeffs.r_u()); + let crv = _mm512_set1_epi32(coeffs.r_v()); + let cgu = _mm512_set1_epi32(coeffs.g_u()); + let cgv = _mm512_set1_epi32(coeffs.g_v()); + let cbu = _mm512_set1_epi32(coeffs.b_u()); + let cbv = _mm512_set1_epi32(coeffs.b_v()); + + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let dup_lo_idx = _mm512_setr_epi64(0, 1, 8, 9, 2, 3, 10, 11); + let dup_hi_idx = _mm512_setr_epi64(4, 5, 12, 13, 6, 7, 14, 15); + + while x + 32 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_32px_avx512(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm512_sub_epi16(u_vec, bias_v); + let v_i16 = _mm512_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(u_i16)); + let u_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(u_i16)); + let v_lo_i32 = _mm512_cvtepi16_epi32(_mm512_castsi512_si256(v_i16)); + let v_hi_i32 = _mm512_cvtepi16_epi32(_mm512_extracti64x4_epi64::<1>(v_i16)); + + let u_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_lo_i32, c_scale_v), + rnd_v, + )); + let u_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(u_hi_i32, c_scale_v), + rnd_v, + )); + let v_d_lo = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_lo_i32, c_scale_v), + rnd_v, + )); + let v_d_hi = q15_shift(_mm512_add_epi32( + _mm512_mullo_epi32(v_hi_i32, c_scale_v), + rnd_v, + )); + + let r_chroma = chroma_i16x32(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let g_chroma = chroma_i16x32(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + let b_chroma = chroma_i16x32(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v, pack_fixup); + + let (r_dup_lo, _r_dup_hi) = chroma_dup(r_chroma, dup_lo_idx, dup_hi_idx); + let (g_dup_lo, _g_dup_hi) = chroma_dup(g_chroma, dup_lo_idx, dup_hi_idx); + let (b_dup_lo, _b_dup_hi) = chroma_dup(b_chroma, dup_lo_idx, dup_hi_idx); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v, pack_fixup); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. + let r = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, r_dup_lo), zero_v, max_v); + let g = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, g_dup_lo), zero_v, max_v); + let b = clamp_u16_max_x32(_mm512_adds_epi16(y_scaled, b_dup_lo), zero_v, max_v); + + // 32-pixel u16 store via the shared 32-pixel writers. + if ALPHA { + let alpha_u16 = _mm_set1_epi16(out_max); + write_rgba_u16_32(r, g, b, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_32(r, g, b, out.as_mut_ptr().add(x * 3)); + } + + x += 32; } - - x += 32; } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -488,7 +501,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. 
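Both the u8 and the u16 paths above reduce to the same Q15 fixed-point recipe: center the sample, multiply by a Q15 scale, add the 1 << 14 rounding term, shift right by 15, then clamp to the output depth. In scalar form, under the same names:

    const RND: i32 = 1 << 14;

    /// Q15 multiply with round-to-nearest, as in `q15_shift(mullo(x, scale) + RND)`.
    fn q15_scale(x: i32, scale: i32) -> i32 {
        (x * scale + RND) >> 15
    }

    /// Native-depth clamp to [0, (1 << BITS) - 1], as in `clamp_u16_max_x32`.
    fn clamp_bits<const BITS: u32>(v: i32) -> u16 {
        v.clamp(0, (1 << BITS) - 1) as u16
    }

    fn main() {
        // Identity scale (1.0 in Q15) passes a centered 10-bit sample through.
        assert_eq!(q15_scale(700 - 512, 1 << 15), 188);
        // The RND term rounds to nearest: 3 * 0.5 becomes 2, not 1.
        assert_eq!(q15_scale(3, 1 << 14), 2);
        // Out-of-range sums pin at the 10-bit ceiling.
        assert_eq!(clamp_bits::<10>(1200), 1023);
    }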
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -505,38 +518,40 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); - let zero = _mm512_setzero_si512(); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 32 <= width { - // Load 64 u16 = 32 pixels and pull just the Y lanes via the - // cross-vector u16 permute. We don't need chroma here. - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const - // count, so it works without runtime-count helper. - let y_shr = _mm512_srli_epi16::<8>(y_raw); - // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack - // fixup); next 32 zero from the zero-hi pack source. - let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup); - // Store first 32 bytes via the low 256-bit half. - _mm256_storeu_si256( - luma_out.as_mut_ptr().add(x).cast(), - _mm512_castsi512_si256(y_u8), - ); - x += 32; + if !BE { + let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); + let zero = _mm512_setzero_si512(); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 32 <= width { + // Load 64 u16 = 32 pixels and pull just the Y lanes via the + // cross-vector u16 permute. We don't need chroma here. + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON / AVX2. `_mm512_srli_epi16::<8>` has a literal const + // count, so it works without runtime-count helper. + let y_shr = _mm512_srli_epi16::<8>(y_raw); + // Pack 32 i16 lanes to u8 — first 32 bytes valid (after pack + // fixup); next 32 zero from the zero-hi pack source. + let y_u8 = narrow_u8x64(y_shr, zero, pack_fixup); + // Store first 32 bytes via the low 256-bit half. + _mm256_storeu_si256( + luma_out.as_mut_ptr().add(x).cast(), + _mm512_castsi512_si256(y_u8), + ); + x += 32; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -554,7 +569,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -571,26 +586,28 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. 
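The single-shift simplification mentioned above is a pure identity: samples sit MSB-aligned in the u16, so normalising with `>> (16 - BITS)` and then reducing to 8 bits with `>> (BITS - 8)` always totals a shift of 8, independent of BITS. A quick check:

    /// Two-step reduction: MSB-aligned u16 sample → native depth → 8 bits.
    fn luma_two_shifts<const BITS: u32>(y: u16) -> u8 {
        ((y >> (16 - BITS)) >> (BITS - 8)) as u8
    }

    fn main() {
        for y in [0u16, 0x0001, 0x8000, 0xABCD, 0xFFFF] {
            assert_eq!(luma_two_shifts::<10>(y), (y >> 8) as u8);
            assert_eq!(luma_two_shifts::<12>(y), (y >> 8) as u8);
        }
    }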
unsafe { - let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); - let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); - let mut x = 0usize; - while x + 32 <= width { - let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); - let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); - let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into - // low-bit-packed form for the native-depth u16 output. - let y_low = _mm512_srl_epi16(y_raw, shr_count); - _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 32; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); + + while x + 32 <= width { + let v0 = _mm512_loadu_si512(packed.as_ptr().add(x * 2).cast()); + let v1 = _mm512_loadu_si512(packed.as_ptr().add(x * 2 + 32).cast()); + let y_raw = _mm512_permutex2var_epi16(v0, y_idx, v1); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples into + // low-bit-packed form for the native-depth u16 output. + let y_low = _mm512_srl_epi16(y_raw, shr_count); + _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 32; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/tests/v210.rs b/src/row/arch/x86_sse41/tests/v210.rs index 6f1b9480..dea42837 100644 --- a/src/row/arch/x86_sse41/tests/v210.rs +++ b/src/row/arch/x86_sse41/tests/v210.rs @@ -26,9 +26,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -40,9 +40,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -54,9 +54,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -68,9 +68,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xAA55); let mut s = 
std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v210_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -82,9 +82,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v210_to_luma_row(&p, &mut s, width); + scalar::v210_to_luma_row::(&p, &mut s, width); unsafe { - v210_to_luma_row(&p, &mut k, width); + v210_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v210→luma diverges (width={width})"); } @@ -93,9 +93,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v210_words(width.div_ceil(6), 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v210_to_luma_u16_row(&p, &mut s, width); + scalar::v210_to_luma_u16_row::(&p, &mut s, width); unsafe { - v210_to_luma_u16_row(&p, &mut k, width); + v210_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v210→luma u16 diverges (width={width})"); } @@ -234,7 +234,7 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v210_to_luma_u16_row(&packed, &mut luma, W); + v210_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "sse4.1 v210 luma reorder bug"); @@ -243,9 +243,15 @@ fn sse41_v210_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v210_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v210_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v210_to_rgb_or_rgba_row::( + scalar::v210_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_sse41/tests/y216.rs b/src/row/arch/x86_sse41/tests/y216.rs index ebe59115..48e7acf8 100644 --- a/src/row/arch/x86_sse41/tests/y216.rs +++ b/src/row/arch/x86_sse41/tests/y216.rs @@ -15,9 +15,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -32,9 +32,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y216_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -48,9 +48,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u8; width]; let mut 
k = std::vec![0u8; width]; - scalar::y216_to_luma_row(&p, &mut s, width); + scalar::y216_to_luma_row::(&p, &mut s, width); unsafe { - y216_to_luma_row(&p, &mut k, width); + y216_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y216→luma diverges (width={width})"); } @@ -59,9 +59,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y216(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y216_to_luma_u16_row(&p, &mut s, width); + scalar::y216_to_luma_u16_row::(&p, &mut s, width); unsafe { - y216_to_luma_u16_row(&p, &mut k, width); + y216_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y216→luma u16 diverges (width={width})"); } @@ -166,7 +166,7 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 let mut luma_u16 = std::vec![0u16; W]; unsafe { - y216_to_luma_u16_row(&packed, &mut luma_u16, W); + y216_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "SSE4.1 y216 luma_u16 reorder bug"); @@ -175,9 +175,15 @@ fn sse41_y216_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y216_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/x86_sse41/tests/y2xx.rs b/src/row/arch/x86_sse41/tests/y2xx.rs index 1c97b77c..fe0e5cf7 100644 --- a/src/row/arch/x86_sse41/tests/y2xx.rs +++ b/src/row/arch/x86_sse41/tests/y2xx.rs @@ -33,7 +33,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { // Part 1: luma u16 natural-order (low-bit-packed: active BITS in low bits). 
let mut luma_u16 = std::vec![0u16; W]; unsafe { - y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); + y2xx_n_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -45,7 +45,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::( + y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut simd_rgb, W, @@ -53,7 +53,7 @@ fn check_y2xx_lane_order_per_pixel_y_and_u() { false, ); } - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -107,9 +107,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: boo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -121,9 +121,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bo let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -135,9 +135,11 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -149,9 +151,11 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range let p = pseudo_random_y210(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -163,9 +167,9 @@ fn check_luma(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::y2xx_n_to_luma_row::(&p, &mut s, width); + scalar::y2xx_n_to_luma_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_row::(&p, &mut k, width); + y2xx_n_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 y2xx<{BITS}>→luma diverges (width={width})"); } @@ -174,9 +178,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_y210(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, 
width); + scalar::y2xx_n_to_luma_u16_row::(&p, &mut s, width); unsafe { - y2xx_n_to_luma_u16_row::(&p, &mut k, width); + y2xx_n_to_luma_u16_row::(&p, &mut k, width); } assert_eq!( s, k, @@ -264,15 +268,15 @@ fn sse41_y212_matches_scalar_widths() { let p = pseudo_random_y212(w, 0xAA55); let mut s = std::vec![0u8; w * 3]; let mut k = std::vec![0u8; w * 3]; - scalar::y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut s, w, ColorMatrix::Bt709, false); + scalar::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut s, w, ColorMatrix::Bt709, false); unsafe { - y2xx_n_to_rgb_or_rgba_row::<12, false>(&p, &mut k, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<12, false, false>(&p, &mut k, w, ColorMatrix::Bt709, false); } assert_eq!(s, k, "SSE4.1 y2xx<12>→RGB diverges (width={w})"); let mut s_u16 = std::vec![0u16; w * 4]; let mut k_u16 = std::vec![0u16; w * 4]; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut s_u16, w, @@ -280,7 +284,7 @@ fn sse41_y212_matches_scalar_widths() { true, ); unsafe { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>( + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>( &p, &mut k_u16, w, @@ -295,17 +299,17 @@ fn sse41_y212_matches_scalar_widths() { let mut sl = std::vec![0u8; w]; let mut kl = std::vec![0u8; w]; - scalar::y2xx_n_to_luma_row::<12>(&p, &mut sl, w); + scalar::y2xx_n_to_luma_row::<12, false>(&p, &mut sl, w); unsafe { - y2xx_n_to_luma_row::<12>(&p, &mut kl, w); + y2xx_n_to_luma_row::<12, false>(&p, &mut kl, w); } assert_eq!(sl, kl, "SSE4.1 y2xx<12>→luma diverges (width={w})"); let mut slu = std::vec![0u16; w]; let mut klu = std::vec![0u16; w]; - scalar::y2xx_n_to_luma_u16_row::<12>(&p, &mut slu, w); + scalar::y2xx_n_to_luma_u16_row::<12, false>(&p, &mut slu, w); unsafe { - y2xx_n_to_luma_u16_row::<12>(&p, &mut klu, w); + y2xx_n_to_luma_u16_row::<12, false>(&p, &mut klu, w); } assert_eq!(slu, klu, "SSE4.1 y2xx<12>→luma u16 diverges (width={w})"); } diff --git a/src/row/arch/x86_sse41/v210.rs b/src/row/arch/x86_sse41/v210.rs index cc11438d..eb37f5b8 100644 --- a/src/row/arch/x86_sse41/v210.rs +++ b/src/row/arch/x86_sse41/v210.rs @@ -14,7 +14,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian::load_endian_u32x4, *}; use crate::{ColorMatrix, row::scalar}; /// Unpacks one 16-byte v210 word into three `__m128i` vectors holding @@ -42,11 +42,11 @@ use crate::{ColorMatrix, row::scalar}; /// `_mm_shuffle_epi8`). #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) { +unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) { // SAFETY: caller obligation — `ptr` has 16 bytes readable; SSE4.1 // (and thus SSSE3) is available. unsafe { - let words = _mm_loadu_si128(ptr.cast()); + let words = load_endian_u32x4::(ptr); let mask10 = _mm_set1_epi32(0x3FF); let low10 = _mm_and_si128(words, mask10); let mid10 = _mm_and_si128(_mm_srli_epi32::<10>(words), mask10); @@ -143,7 +143,7 @@ unsafe fn unpack_v210_word_sse41(ptr: *const u8) -> (__m128i, __m128i, __m128i) /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
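All of the v210 byte-order handling sits in that one load: `load_endian_u32x4` reads four u32 words and, presumably, byte-swaps each one on the BE path (the helper's definition is in the `endian` module, not shown in this hunk), after which the 10-bit field extraction is endian-agnostic. A scalar sketch under that assumption:

    /// Scalar model of the endian-aware v210 word load plus field split.
    fn unpack_v210_words<const BE: bool>(bytes: &[u8; 16]) -> [(u16, u16, u16); 4] {
        core::array::from_fn(|i| {
            let raw: [u8; 4] = bytes[i * 4..i * 4 + 4].try_into().unwrap();
            let w = if BE { u32::from_be_bytes(raw) } else { u32::from_le_bytes(raw) };
            (
                (w & 0x3FF) as u16,         // bits 0..10
                ((w >> 10) & 0x3FF) as u16, // bits 10..20
                ((w >> 20) & 0x3FF) as u16, // bits 20..30
            )
        })
    }

    fn main() {
        // One word packing samples 1, 2, 3 into the three 10-bit fields,
        // stored once little-endian and once big-endian.
        let word: u32 = 1 | (2 << 10) | (3 << 20);
        let mut buf = [0u8; 16];
        buf[..4].copy_from_slice(&word.to_le_bytes());
        buf[4..8].copy_from_slice(&word.to_be_bytes());
        assert_eq!(unpack_v210_words::<false>(&buf)[0], (1, 2, 3));
        assert_eq!(unpack_v210_words::<true>(&buf)[1], (1, 2, 3));
    }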
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_rgb_or_rgba_row( +pub(crate) unsafe fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -180,7 +180,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let cbv = _mm_set1_epi32(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; @@ -263,7 +263,13 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v210_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -280,7 +286,7 @@ pub(crate) unsafe fn v210_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( packed: &[u8], out: &mut [u16], width: usize, @@ -317,7 +323,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let cbv = _mm_set1_epi32(coeffs.b_v()); for w in 0..words { - let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, u_vec, v_vec) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); let y_i16 = y_vec; let u_i16 = _mm_sub_epi16(u_vec, bias_v); @@ -383,7 +389,7 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut out[tail_start_px * bpp..width * bpp]; let tail_w = width - tail_start_px; - scalar::v210_to_rgb_u16_or_rgba_u16_row::( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -406,7 +412,11 @@ pub(crate) unsafe fn v210_to_rgb_u16_or_rgba_u16_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -416,7 +426,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); // Downshift 10-bit Y by 2 → 8-bit, narrow to u8x8 via packus. let y_shr = _mm_srli_epi16::<2>(y_vec); let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); @@ -430,7 +440,7 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -447,7 +457,11 @@ pub(crate) unsafe fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: /// 4. `luma_out.len() >= width`. 
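The luma path above leans on v210's 6-pixels-per-16-byte geometry: each word group holds six Y samples at fixed (word, shift) positions, and the 10→8 conversion is the bare `>> 2` with no rounding. A scalar sketch, assuming the standard v210 field order Cb0 Y0 Cr0 | Y1 Cb1 Y2 | Cr1 Y3 Cb2 | Y4 Cr2 Y5:

    /// (u32 word index, bit shift) of the six Y samples in one v210 group,
    /// under the field-order assumption stated above.
    const Y_POS: [(usize, u32); 6] = [(0, 10), (1, 0), (1, 20), (2, 10), (3, 0), (3, 20)];

    fn v210_group_to_luma8(words: &[u32; 4], out: &mut [u8; 6]) {
        for (dst, &(w, sh)) in out.iter_mut().zip(Y_POS.iter()) {
            let y10 = (words[w] >> sh) & 0x3FF;
            *dst = (y10 >> 2) as u8; // 10-bit → 8-bit, as `_mm_srli_epi16::<2>` above
        }
    }

    fn main() {
        let mut words = [0u32; 4];
        for (i, &(w, sh)) in Y_POS.iter().enumerate() {
            words[w] |= ((i as u32 + 1) << 2) << sh; // survives the >> 2 as i + 1
        }
        let mut out = [0u8; 6];
        v210_group_to_luma8(&words, &mut out);
        assert_eq!(out, [1, 2, 3, 4, 5, 6]);
    }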
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); let words = width / 6; @@ -457,7 +471,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w // SAFETY: caller's obligation per the safety contract above. unsafe { for w in 0..words { - let (y_vec, _, _) = unpack_v210_word_sse41(packed.as_ptr().add(w * 16)); + let (y_vec, _, _) = unpack_v210_word_sse41::(packed.as_ptr().add(w * 16)); // Store 6 of the 8 u16 lanes via stack buffer + copy_from_slice. let mut tmp = [0u16; 8]; _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_vec); @@ -468,7 +482,7 @@ pub(crate) unsafe fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], w let tail_packed = &packed[words * 16..total_words * 16]; let tail_out = &mut luma_out[tail_start_px..width]; let tail_w = width - tail_start_px; - scalar::v210_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::v210_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/y216.rs b/src/row/arch/x86_sse41/y216.rs index a98cdc45..e799caee 100644 --- a/src/row/arch/x86_sse41/y216.rs +++ b/src/row/arch/x86_sse41/y216.rs @@ -48,7 +48,7 @@ use crate::{ColorMatrix, row::scalar}; /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_rgb_or_rgba_row( +pub(crate) unsafe fn y216_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -65,160 +65,168 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - let rnd_v = _mm_set1_epi32(RND); - // Y216 samples are full u16 [0..65535]; use i32 y_off and - // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767. - let y_off_v = _mm_set1_epi32(y_off); - let y_scale_v = _mm_set1_epi32(y_scale); - let c_scale_v = _mm_set1_epi32(c_scale); - // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000. - let bias16_v = _mm_set1_epi16(-32768i16); - let cru = _mm_set1_epi32(coeffs.r_u()); - let crv = _mm_set1_epi32(coeffs.r_v()); - let cgu = _mm_set1_epi32(coeffs.g_u()); - let cgv = _mm_set1_epi32(coeffs.g_v()); - let cbu = _mm_set1_epi32(coeffs.b_u()); - let cbv = _mm_set1_epi32(coeffs.b_v()); - let alpha_u8 = _mm_set1_epi8(-1); - - // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each). - // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes. - // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes): - // 0,1 = Y0 2,3 = U0 4,5 = Y1 6,7 = V0 - // 8,9 = Y2 10,11 = U1 12,13 = Y3 14,15 = V1 - // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed. - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes. - let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - // U lanes from interleaved [U,V,U,V,...]: even u16 lanes. - let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - // V lanes: odd u16 lanes. 
- let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------ - // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3 - // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7 - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - - // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi. - let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8 - let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0] - let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8 - - // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group. - let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0] - let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0] - let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3] - - // Split U and V (4 valid low-half lanes each). - let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8 - let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8 - - // Center UV: subtract 32768 wrapping. - let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v); - let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v); - - // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale. - let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3] - let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3] - // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are - // 0x8080 garbage from the -1-byte shuffles, but we don't use them. - // Widen the high half too for `chroma_i16x8` (don't-care input). - let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16)); - let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v)); - - // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3, - // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid. - let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); - - // Duplicate each chroma sample into its Y-pair slot (4:2:2): - // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3] - let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo); - let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo); - let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo); - - // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767. - let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v); - - // Saturating add and narrow to u8. 
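End to end, the per-pixel math the comments above walk through is short in scalar form: widen Y unsigned (a Y216 sample can exceed 32767, so a signed i16 view would corrupt it), apply the Q15 round-shift, saturating-add the precomputed chroma term, and let the final u8 saturation do what `_mm_packus_epi16` does. A sketch that assumes, as the real kernels arrange, that the scaled Y fits in i16:

    const RND: i32 = 1 << 14;

    /// Scalar reference for one Y216 luma sample combined with a chroma term.
    fn y216_pixel(y: u16, y_off: i32, y_scale: i32, chroma: i16) -> u8 {
        // Unsigned widen: `y as i32` is exact over the full 0..=65535 range.
        let y_scaled = ((y as i32 - y_off) * y_scale + RND) >> 15;
        // i16 saturating add, then u8 saturation (the packus step).
        let sum = (y_scaled as i16).saturating_add(chroma);
        sum.clamp(0, 255) as u8
    }

    fn main() {
        // A near-identity 16→8-bit scale (255/256 in Q15) maps 256 → 255.
        assert_eq!(y216_pixel(256, 0, 255 * 128, 0), 255);
        // Below-offset input goes negative and clamps to 0 instead of wrapping.
        assert_eq!(y216_pixel(0, 4096, 1 << 15, 0), 0);
    }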
- let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128()); - let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128()); - let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128()); - - // --- hi group: pixels x+8..x+15 --------------------------------- - let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - - let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx); - let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx); - let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - - let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx); - let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx); - let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half); - - let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx); - let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx); - - let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v); - let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v); - - let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16); - let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16)); - let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16)); - - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v)); - - let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); - - let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi); - let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi); - let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi); - - let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v); - - let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128()); - let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128()); - let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128()); - - // Combine two 8-pixel groups into 16-pixel output. - // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes. - // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8. - let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8); - let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8); - let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8); - - if ALPHA { - write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + if !BE { + let rnd_v = _mm_set1_epi32(RND); + // Y216 samples are full u16 [0..65535]; use i32 y_off and + // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767. + let y_off_v = _mm_set1_epi32(y_off); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + // Subtract chroma bias (32768) via wrapping: -32768i16 bits = 0x8000. 
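The `-32768i16` splat works because two's-complement wrapping subtraction of 0x8000 maps unsigned 0..=65535 onto signed -32768..=32767 while preserving order, i.e. it recenters u16 chroma around zero with no widening step. The identity, checked directly:

    fn main() {
        // Wrapping-subtract 0x8000 == reinterpret-as-signed recentering.
        for u in [0u16, 1, 32767, 32768, 40000, 65535] {
            let centered = u.wrapping_sub(0x8000) as i16;
            assert_eq!(i32::from(centered), i32::from(u) - 32768);
        }
        // The same bit pattern the kernels splat into `bias16_v`.
        assert_eq!((-32768i16) as u16, 0x8000);
    }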
+ let bias16_v = _mm_set1_epi16(-32768i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + let alpha_u8 = _mm_set1_epi8(-1); + + // Byte-level shuffle masks for one 8-pixel group (2 loads of 8 u16 each). + // Each load holds 4 YUYV quadruples = 8 u16 = 16 bytes. + // Byte layout of one load `[Y0,U0,Y1,V0,Y2,U1,Y3,V1]` (bytes): + // 0,1 = Y0 2,3 = U0 4,5 = Y1 6,7 = V0 + // 8,9 = Y2 10,11 = U1 12,13 = Y3 14,15 = V1 + // Y (even u16 lanes): bytes [0,1,4,5,8,9,12,13] → low 8 bytes, high zeroed. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + // Chroma (odd u16 lanes): bytes [2,3,6,7,10,11,14,15] → low 8 bytes. + let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + // U lanes from interleaved [U,V,U,V,...]: even u16 lanes. + let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + // V lanes: odd u16 lanes. + let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 16 <= width { + // --- lo group: pixels x..x+7 (8 pixels, 16 u16 = 2 loads) ------ + // packed[x*2 .. x*2+8] = quadruples 0,1 = pixels x..x+3 + // packed[x*2+8 .. x*2+16] = quadruples 2,3 = pixels x+4..x+7 + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + + // Y extraction: [Y0,Y1,Y2,Y3] from lo and [Y4,Y5,Y6,Y7] from hi. + let y_lo_half = _mm_shuffle_epi8(lo, y_idx); // [Y0,Y1,Y2,Y3, 0,0,0,0] in u16x8 + let y_hi_half = _mm_shuffle_epi8(hi, y_idx); // [Y4,Y5,Y6,Y7, 0,0,0,0] + let y_lo_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] u16x8 + + // Chroma extraction: interleaved [U,V,U,V,...] per 4-pair group. + let c_lo_half = _mm_shuffle_epi8(lo, c_idx); // [U0,V0,U1,V1, 0,0,0,0] + let c_hi_half = _mm_shuffle_epi8(hi, c_idx); // [U2,V2,U3,V3, 0,0,0,0] + let chroma_lo = _mm_unpacklo_epi64(c_lo_half, c_hi_half); // [U0,V0,U1,V1,U2,V2,U3,V3] + + // Split U and V (4 valid low-half lanes each). + let u_lo = _mm_shuffle_epi8(chroma_lo, u_idx); // [U0,U1,U2,U3, 0,0,0,0] u16x8 + let v_lo = _mm_shuffle_epi8(chroma_lo, v_idx); // [V0,V1,V2,V3, 0,0,0,0] u16x8 + + // Center UV: subtract 32768 wrapping. + let u_lo_i16 = _mm_sub_epi16(u_lo, bias16_v); + let v_lo_i16 = _mm_sub_epi16(v_lo, bias16_v); + + // Widen 4 valid i16 chroma lanes to i32x4 for Q15 scale. + let u_lo_i32 = _mm_cvtepi16_epi32(u_lo_i16); // [U0,U1,U2,U3] + let v_lo_i32 = _mm_cvtepi16_epi32(v_lo_i16); // [V0,V1,V2,V3] + // `_mm_cvtepi16_epi32` uses the low 4 lanes; high 4 of u_lo_i16 are + // 0x8080 garbage from the -1-byte shuffles, but we don't use them. + // Widen the high half too for `chroma_i16x8` (don't-care input). + let u_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_lo_i16)); + let v_lo_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_lo_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_hi, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_lo_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_hi, c_scale_v), rnd_v)); + + // chroma_i16x8 takes two i32x4 halves (lo=valid lanes 0..3, + // hi=don't-care lanes 4..7) → produces i16x8 with only lanes 0..3 valid. 
+ let r_chroma_lo = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let g_chroma_lo = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + let b_chroma_lo = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_lo_hi, v_d_lo_hi, rnd_v); + + // Duplicate each chroma sample into its Y-pair slot (4:2:2): + // unpacklo_epi16([c0,c1,c2,c3,...], same) → [c0,c0,c1,c1,c2,c2,c3,c3] + let r_dup_lo = _mm_unpacklo_epi16(r_chroma_lo, r_chroma_lo); + let g_dup_lo = _mm_unpacklo_epi16(g_chroma_lo, g_chroma_lo); + let b_dup_lo = _mm_unpacklo_epi16(b_chroma_lo, b_chroma_lo); + + // Scale Y: unsigned-widening avoids i16 overflow for Y > 32767. + let y_lo_scaled = scale_y_u16(y_lo_vec, y_off_v, y_scale_v, rnd_v); + + // Saturating add and narrow to u8. + let r_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, r_dup_lo), _mm_setzero_si128()); + let g_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, g_dup_lo), _mm_setzero_si128()); + let b_lo_u8 = _mm_packus_epi16(_mm_adds_epi16(y_lo_scaled, b_dup_lo), _mm_setzero_si128()); + + // --- hi group: pixels x+8..x+15 --------------------------------- + let lo2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi2 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + + let y_lo2_half = _mm_shuffle_epi8(lo2, y_idx); + let y_hi2_half = _mm_shuffle_epi8(hi2, y_idx); + let y_hi_vec = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + + let c_lo2_half = _mm_shuffle_epi8(lo2, c_idx); + let c_hi2_half = _mm_shuffle_epi8(hi2, c_idx); + let chroma_hi = _mm_unpacklo_epi64(c_lo2_half, c_hi2_half); + + let u_hi = _mm_shuffle_epi8(chroma_hi, u_idx); + let v_hi = _mm_shuffle_epi8(chroma_hi, v_idx); + + let u_hi_i16 = _mm_sub_epi16(u_hi, bias16_v); + let v_hi_i16 = _mm_sub_epi16(v_hi, bias16_v); + + let u_hi_i32 = _mm_cvtepi16_epi32(u_hi_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(v_hi_i16); + let u_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_hi_i16)); + let v_hi_hi = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_hi_i16)); + + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let u_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_hi, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + let v_d_hi_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_hi, c_scale_v), rnd_v)); + + let r_chroma_hi = chroma_i16x8(cru, crv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + let g_chroma_hi = chroma_i16x8(cgu, cgv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + let b_chroma_hi = chroma_i16x8(cbu, cbv, u_d_hi, v_d_hi, u_d_hi_hi, v_d_hi_hi, rnd_v); + + let r_dup_hi = _mm_unpacklo_epi16(r_chroma_hi, r_chroma_hi); + let g_dup_hi = _mm_unpacklo_epi16(g_chroma_hi, g_chroma_hi); + let b_dup_hi = _mm_unpacklo_epi16(b_chroma_hi, b_chroma_hi); + + let y_hi_scaled = scale_y_u16(y_hi_vec, y_off_v, y_scale_v, rnd_v); + + let r_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, r_dup_hi), _mm_setzero_si128()); + let g_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, g_dup_hi), _mm_setzero_si128()); + let b_hi_u8 = _mm_packus_epi16(_mm_adds_epi16(y_hi_scaled, b_dup_hi), _mm_setzero_si128()); + + // Combine two 8-pixel groups into 16-pixel output. + // Each *_lo_u8 / *_hi_u8 holds 8 valid u8 in its low 8 bytes. + // `_mm_unpacklo_epi64` joins the two low halves → 16 valid u8. 
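Both unpack tricks above are the same interleave at different granularities: `_mm_unpacklo_epi16(c, c)` duplicates each chroma sample into its 4:2:2 Y pair, and `_mm_unpacklo_epi64` later splices two half-full u8 vectors. A scalar model of the 16-bit case:

    /// Scalar model of `_mm_unpacklo_epi16`: interleave the low four u16
    /// lanes of `a` and `b`.
    fn unpacklo_u16x8(a: [u16; 8], b: [u16; 8]) -> [u16; 8] {
        [a[0], b[0], a[1], b[1], a[2], b[2], a[3], b[3]]
    }

    fn main() {
        // 4:2:2 duplication: interleave a chroma vector with itself.
        let chroma = [10u16, 11, 12, 13, 0, 0, 0, 0]; // high lanes are don't-care
        assert_eq!(unpacklo_u16x8(chroma, chroma), [10, 10, 11, 11, 12, 12, 13, 13]);
    }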
+ let r_u8 = _mm_unpacklo_epi64(r_lo_u8, r_hi_u8); + let g_u8 = _mm_unpacklo_epi64(g_lo_u8, g_hi_u8); + let b_u8 = _mm_unpacklo_epi64(b_lo_u8, b_hi_u8); + + if ALPHA { + write_rgba_16(r_u8, g_u8, b_u8, alpha_u8, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_16(r_u8, g_u8, b_u8, out.as_mut_ptr().add(x * 3)); + } + + x += 16; } + } // end if !BE - x += 16; - } - - // Scalar tail — remaining < 16 pixels. + // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::y216_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -241,7 +249,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -258,147 +266,149 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( const RND: i64 = 1 << 14; unsafe { - let alpha_u16 = _mm_set1_epi16(-1i16); - let rnd_v = _mm_set1_epi64x(RND); - let rnd32_v = _mm_set1_epi32(1 << 14); - let y_off_v = _mm_set1_epi32(y_off); - let y_scale_v = _mm_set1_epi32(y_scale); - let c_scale_v = _mm_set1_epi32(c_scale); - // bias 32768 via wrapping i16 trick - let bias16_v = _mm_set1_epi16(-32768i16); - let cru = _mm_set1_epi32(coeffs.r_u()); - let crv = _mm_set1_epi32(coeffs.r_v()); - let cgu = _mm_set1_epi32(coeffs.g_u()); - let cgv = _mm_set1_epi32(coeffs.g_v()); - let cbu = _mm_set1_epi32(coeffs.b_u()); - let cbv = _mm_set1_epi32(coeffs.b_v()); - - // Byte-level shuffle masks (same as u8 path). - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - // Two 128-bit loads: each covers 8 u16 = 4 pixels. - // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1] - // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3] - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - - // Y: [Y0..Y7] u16x8 - let y_lo_half = _mm_shuffle_epi8(lo, y_idx); - let y_hi_half = _mm_shuffle_epi8(hi, y_idx); - let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); - - // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3] - let c_lo_half = _mm_shuffle_epi8(lo, c_idx); - let c_hi_half = _mm_shuffle_epi8(hi, c_idx); - let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half); - - // U and V (4 valid low-half lanes each) - let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0] - let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0] - - // Center UV via wrapping i16 subtraction. - let u_i16 = _mm_sub_epi16(u_vec4, bias16_v); - let v_i16 = _mm_sub_epi16(v_vec4, bias16_v); - - // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16). 
- let u_i32 = _mm_cvtepi16_epi32(u_i16); - let v_i32 = _mm_cvtepi16_epi32(v_i16); - let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v)); - let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v)); - - // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes. - let u_d_even = u_d; - let v_d_even = v_d; - let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even - let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d); - - let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v); - let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v); - let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v); - let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); - let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v); - let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); - - // Reassemble i64x2 pairs (even + odd) → i32x4. - let r_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(r_ch_even, r_ch_odd), - _mm_unpackhi_epi32(r_ch_even, r_ch_odd), - ); - let g_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(g_ch_even, g_ch_odd), - _mm_unpackhi_epi32(g_ch_even, g_ch_odd), - ); - let b_ch_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(b_ch_even, b_ch_odd), - _mm_unpackhi_epi32(b_ch_even, b_ch_odd), - ); - - // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2). - // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3) - // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7) - let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32); - let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32); - let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32); - let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32); - let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32); - let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32); - - // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. - let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32 - let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7] - let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v); - let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v); - - // Even/odd split for _mm_mul_epi32. - let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v); - let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v); - let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v); - let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v); - - // Reassemble Y i64x2 pairs to i32x4. - let y_lo_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(y_lo_even, y_lo_odd), - _mm_unpackhi_epi32(y_lo_even, y_lo_odd), - ); - let y_hi_i32 = _mm_unpacklo_epi64( - _mm_unpacklo_epi32(y_hi_even, y_hi_odd), - _mm_unpackhi_epi32(y_hi_even, y_hi_odd), - ); - - // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32. 
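`_mm_packus_epi32` (SSE4.1) is what makes the plain `_mm_add_epi32` safe without an explicit clamp: each i32 lane saturates to 0..=65535 as it narrows. A scalar model:

    /// Scalar model of `_mm_packus_epi32`: narrow eight i32 lanes (two
    /// vectors of four) to u16 with unsigned saturation.
    fn packus_epi32(a: [i32; 4], b: [i32; 4]) -> [u16; 8] {
        let mut out = [0u16; 8];
        for (o, v) in out.iter_mut().zip(a.into_iter().chain(b)) {
            *o = v.clamp(0, 65535) as u16;
        }
        out
    }

    fn main() {
        assert_eq!(
            packus_epi32([-5, 0, 65535, 70000], [1, 2, 3, 4]),
            [0, 0, 65535, 65535, 1, 2, 3, 4]
        );
    }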
- let r_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, r_dup_lo), - _mm_add_epi32(y_hi_i32, r_dup_hi), - ); - let g_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, g_dup_lo), - _mm_add_epi32(y_hi_i32, g_dup_hi), - ); - let b_u16 = _mm_packus_epi32( - _mm_add_epi32(y_lo_i32, b_dup_lo), - _mm_add_epi32(y_hi_i32, b_dup_hi), - ); - - if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + if !BE { + let alpha_u16 = _mm_set1_epi16(-1i16); + let rnd_v = _mm_set1_epi64x(RND); + let rnd32_v = _mm_set1_epi32(1 << 14); + let y_off_v = _mm_set1_epi32(y_off); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + // bias 32768 via wrapping i16 trick + let bias16_v = _mm_set1_epi16(-32768i16); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + // Byte-level shuffle masks (same as u8 path). + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + let c_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + let u_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + let v_idx = _mm_setr_epi8(2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + // Two 128-bit loads: each covers 8 u16 = 4 pixels. + // packed[x*2 .. x*2+8] = [Y0,U0,Y1,V0,Y2,U1,Y3,V1] + // packed[x*2+8 .. x*2+16] = [Y4,U2,Y5,V2,Y6,U3,Y7,V3] + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + + // Y: [Y0..Y7] u16x8 + let y_lo_half = _mm_shuffle_epi8(lo, y_idx); + let y_hi_half = _mm_shuffle_epi8(hi, y_idx); + let y_vec = _mm_unpacklo_epi64(y_lo_half, y_hi_half); + + // UV interleaved: [U0,V0,U1,V1,U2,V2,U3,V3] + let c_lo_half = _mm_shuffle_epi8(lo, c_idx); + let c_hi_half = _mm_shuffle_epi8(hi, c_idx); + let chroma = _mm_unpacklo_epi64(c_lo_half, c_hi_half); + + // U and V (4 valid low-half lanes each) + let u_vec4 = _mm_shuffle_epi8(chroma, u_idx); // [U0,U1,U2,U3, 0,0,0,0] + let v_vec4 = _mm_shuffle_epi8(chroma, v_idx); // [V0,V1,V2,V3, 0,0,0,0] + + // Center UV via wrapping i16 subtraction. + let u_i16 = _mm_sub_epi16(u_vec4, bias16_v); + let v_i16 = _mm_sub_epi16(v_vec4, bias16_v); + + // Scale UV in i32 (4 valid lanes from low half of u_i16/v_i16). + let u_i32 = _mm_cvtepi16_epi32(u_i16); + let v_i32 = _mm_cvtepi16_epi32(v_i16); + let u_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_i32, c_scale_v), rnd32_v)); + let v_d = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_i32, c_scale_v), rnd32_v)); + + // i64 chroma: _mm_mul_epi32 uses even-indexed i32 lanes. + let u_d_even = u_d; + let v_d_even = v_d; + let u_d_odd = _mm_shuffle_epi32::<0xF5>(u_d); // [1,1,3,3] → odd to even + let v_d_odd = _mm_shuffle_epi32::<0xF5>(v_d); + + let r_ch_even = chroma_i64x2(cru, crv, u_d_even, v_d_even, rnd_v); + let r_ch_odd = chroma_i64x2(cru, crv, u_d_odd, v_d_odd, rnd_v); + let g_ch_even = chroma_i64x2(cgu, cgv, u_d_even, v_d_even, rnd_v); + let g_ch_odd = chroma_i64x2(cgu, cgv, u_d_odd, v_d_odd, rnd_v); + let b_ch_even = chroma_i64x2(cbu, cbv, u_d_even, v_d_even, rnd_v); + let b_ch_odd = chroma_i64x2(cbu, cbv, u_d_odd, v_d_odd, rnd_v); + + // Reassemble i64x2 pairs (even + odd) → i32x4. 
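The `unpacklo_epi32`/`unpackhi_epi32` plus `unpacklo_epi64` sequence that follows collects the low 32 bits of each i64 result and restores source lane order. On plain arrays the lane bookkeeping is just an interleave (a sketch of the bookkeeping, not of the intrinsics themselves):

    // `even` holds the results for original lanes {0, 2}, `odd` for {1, 3}.
    fn reassemble(even: [i32; 2], odd: [i32; 2]) -> [i32; 4] {
        [even[0], odd[0], even[1], odd[1]]
    }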
+ let r_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(r_ch_even, r_ch_odd), + _mm_unpackhi_epi32(r_ch_even, r_ch_odd), + ); + let g_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(g_ch_even, g_ch_odd), + _mm_unpackhi_epi32(g_ch_even, g_ch_odd), + ); + let b_ch_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(b_ch_even, b_ch_odd), + _mm_unpackhi_epi32(b_ch_even, b_ch_odd), + ); + + // Duplicate each chroma value for 2 Y pixels per chroma pair (4:2:2). + // unpacklo_epi32([r0,r1,r2,r3], same) → [r0,r0,r1,r1] (pixels 0,1,2,3) + // unpackhi_epi32([r0,r1,r2,r3], same) → [r2,r2,r3,r3] (pixels 4,5,6,7) + let r_dup_lo = _mm_unpacklo_epi32(r_ch_i32, r_ch_i32); + let r_dup_hi = _mm_unpackhi_epi32(r_ch_i32, r_ch_i32); + let g_dup_lo = _mm_unpacklo_epi32(g_ch_i32, g_ch_i32); + let g_dup_hi = _mm_unpackhi_epi32(g_ch_i32, g_ch_i32); + let b_dup_lo = _mm_unpacklo_epi32(b_ch_i32, b_ch_i32); + let b_dup_hi = _mm_unpackhi_epi32(b_ch_i32, b_ch_i32); + + // Y: unsigned-widen u16 → i32, subtract y_off, scale via i64. + let y_lo_pair = _mm_cvtepu16_epi32(y_vec); // [y0,y1,y2,y3] as i32 + let y_hi_pair = _mm_cvtepu16_epi32(_mm_srli_si128::<8>(y_vec)); // [y4,y5,y6,y7] + let y_lo_sub = _mm_sub_epi32(y_lo_pair, y_off_v); + let y_hi_sub = _mm_sub_epi32(y_hi_pair, y_off_v); + + // Even/odd split for _mm_mul_epi32. + let y_lo_even = scale_y16_i64(y_lo_sub, y_scale_v, rnd_v); + let y_lo_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_lo_sub), y_scale_v, rnd_v); + let y_hi_even = scale_y16_i64(y_hi_sub, y_scale_v, rnd_v); + let y_hi_odd = scale_y16_i64(_mm_shuffle_epi32::<0xF5>(y_hi_sub), y_scale_v, rnd_v); + + // Reassemble Y i64x2 pairs to i32x4. + let y_lo_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(y_lo_even, y_lo_odd), + _mm_unpackhi_epi32(y_lo_even, y_lo_odd), + ); + let y_hi_i32 = _mm_unpacklo_epi64( + _mm_unpacklo_epi32(y_hi_even, y_hi_odd), + _mm_unpackhi_epi32(y_hi_even, y_hi_odd), + ); + + // Add Y + chroma, saturate i32 → u16 via _mm_packus_epi32. + let r_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, r_dup_lo), + _mm_add_epi32(y_hi_i32, r_dup_hi), + ); + let g_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, g_dup_lo), + _mm_add_epi32(y_hi_i32, g_dup_hi), + ); + let b_u16 = _mm_packus_epi32( + _mm_add_epi32(y_lo_i32, b_dup_lo), + _mm_add_epi32(y_hi_i32, b_dup_hi), + ); + + if ALPHA { + write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); + } + + x += 8; } + } // end if !BE - x += 8; - } - - // Scalar tail — remaining < 8 pixels. + // Scalar tail — remaining < 8 pixels, or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y216_to_rgb_u16_or_rgba_u16_row::( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -423,49 +433,55 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( /// 4. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn y216_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { - // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes. 
- let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - // Four loads covering 16 pixels (16 u16 per load pair). - // packed offset x*2 = quadruple-base for pixel x. - // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15. - let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - - // Extract Y lanes into u16x8. - let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..] - let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..] - let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] - - let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..] - let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..] - let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - - // `>> 8` to get u8 luma (high byte of each Y sample). - let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo); - let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi); - // Pack 16 × i16 → 16 × u8. - let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr); - _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8); - - x += 16; + if !BE { + // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 16 <= width { + // Four loads covering 16 pixels (16 u16 per load pair). + // packed offset x*2 = quadruple-base for pixel x. + // lo0/hi0 cover pixels x..x+7, lo1/hi1 cover x+8..x+15. + let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + + // Extract Y lanes into u16x8. + let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); // [Y0..Y3, 0..] + let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); // [Y4..Y7, 0..] + let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] + + let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); // [Y8..Y11, 0..] + let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); // [Y12..Y15, 0..] + let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + + // `>> 8` to get u8 luma (high byte of each Y sample). + let y_lo_shr = _mm_srli_epi16::<8>(y_vec_lo); + let y_hi_shr = _mm_srli_epi16::<8>(y_vec_hi); + // Pack 16 × i16 → 16 × u8. + let y_u8 = _mm_packus_epi16(y_lo_shr, y_hi_shr); + _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_u8); + + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -484,41 +500,47 @@ pub(crate) unsafe fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 4. `out.len() >= width`. 
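The `if !BE` gating introduced throughout this patch always has the same shape: when `BE` is true the vector loop never runs, `x` stays at 0, and the pre-existing scalar-tail call absorbs the whole row. Reduced to its skeleton (names are illustrative):

    fn row_kernel<const BE: bool>(packed: &[u16], out: &mut [u8], width: usize) {
        let mut x = 0usize;
        if !BE {
            while x + 8 <= width {
                // ... SIMD body for pixels x..x + 8 ...
                x += 8;
            }
        }
        if x < width {
            // Tail: < 8 pixels normally, the entire row when BE = true.
            scalar_fallback::<BE>(&packed[x * 2..width * 2], &mut out[x..width]);
        }
    }

    fn scalar_fallback<const BE: bool>(_packed: &[u16], _out: &mut [u8]) { /* ... */ }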
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn y216_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); unsafe { - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 16 <= width { - let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); - let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); + if !BE { + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); - let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); - let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] + while x + 16 <= width { + let lo0 = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi0 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let lo1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 16).cast()); + let hi1 = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 24).cast()); - let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); - let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); - let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] + let y_lo_half = _mm_shuffle_epi8(lo0, y_idx); + let y_hi_half = _mm_shuffle_epi8(hi0, y_idx); + let y_vec_lo = _mm_unpacklo_epi64(y_lo_half, y_hi_half); // [Y0..Y7] - // Direct copy — full 16-bit Y values, no shift. - _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo); - _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi); + let y_lo2_half = _mm_shuffle_epi8(lo1, y_idx); + let y_hi2_half = _mm_shuffle_epi8(hi1, y_idx); + let y_vec_hi = _mm_unpacklo_epi64(y_lo2_half, y_hi2_half); // [Y8..Y15] - x += 16; + // Direct copy — full 16-bit Y values, no shift. + _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec_lo); + _mm_storeu_si128(out.as_mut_ptr().add(x + 8).cast(), y_vec_hi); + + x += 16; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x..width]; let tail_w = width - x; - scalar::y216_to_luma_u16_row(tail_packed, tail_out, tail_w); + scalar::y216_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/arch/x86_sse41/y2xx.rs b/src/row/arch/x86_sse41/y2xx.rs index eaa88f7e..e8e18aff 100644 --- a/src/row/arch/x86_sse41/y2xx.rs +++ b/src/row/arch/x86_sse41/y2xx.rs @@ -130,7 +130,11 @@ unsafe fn unpack_y2xx_8px_sse41( /// 4. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
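For context on the hunk that follows: Y216 carries full 16-bit samples, so its native-depth luma is the direct copy seen above, while the 10/12-bit Y2xx kernels below must low-pack with a `16 - BITS` shift. The contrast, as one illustrative function (not a helper from the patch):

    fn native_depth_luma<const BITS: u32>(msb_aligned: u16) -> u16 {
        // BITS = 16 (Y216): shift by 0, i.e. a plain copy.
        // BITS = 10 / 12 (Y210 / Y212): low-pack the MSB-aligned sample.
        msb_aligned >> (16 - BITS)
    }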
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -158,111 +162,114 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row(u_i16)); - let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - - // 8-lane chroma vectors with valid data in lanes 0..3. - let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via - // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with - // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3]. - let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); - let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); - let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); - - // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits - // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` (or zero - // for hi) so the low 8 bytes of the result hold the saturated - // u8 of the input i16x8. Only the first 8 bytes per channel - // matter. - let zero = _mm_setzero_si128(); - let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero); - let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero); - let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero); - - // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16` - // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter - // body we use the v210-style stack-buffer + scalar interleave - // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.) - let mut r_tmp = [0u8; 16]; - let mut g_tmp = [0u8; 16]; - let mut b_tmp = [0u8; 16]; - _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8); - _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8); - _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8); - - if ALPHA { - let dst = &mut out[x * 4..x * 4 + 8 * 4]; - for i in 0..8 { - dst[i * 4] = r_tmp[i]; - dst[i * 4 + 1] = g_tmp[i]; - dst[i * 4 + 2] = b_tmp[i]; - dst[i * 4 + 3] = 0xFF; - } - } else { - let dst = &mut out[x * 3..x * 3 + 8 * 3]; - for i in 0..8 { - dst[i * 3] = r_tmp[i]; - dst[i * 3 + 1] = g_tmp[i]; - dst[i * 3 + 2] = b_tmp[i]; + if !BE { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + // Loop-invariant runtime shift count for `_mm_srl_epi16`, see + // module-level note. 
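On the runtime shift count above: `_mm_srli_epi16` takes an immediate, and `16 - BITS` is a generic const expression that stable Rust cannot pass as one, so the kernels hoist the count into an XMM register once and use `_mm_srl_epi16`, which reads its count from the low 64 bits of a vector. Per lane it means nothing more than:

    fn msb_to_native<const BITS: u32>(sample: u16) -> u16 {
        sample >> (16 - BITS)
    }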
+ let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + + // Subtract chroma bias (e.g. 512 for 10-bit) — fits i16 since + // each chroma sample is ≤ 2^BITS - 1 ≤ 4095. + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + // Widen 8-lane i16 chroma to two i32x4 halves so the Q15 + // multiplies don't overflow. Only lanes 0..3 of `_lo` are + // valid; `_hi` is entirely don't-care. We feed both halves + // through `chroma_i16x8` to recycle the helper exactly; the + // don't-care output lanes are discarded by the + // `_mm_unpacklo_epi16` duplicate step below (which only consumes + // lanes 0..3). + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + // 8-lane chroma vectors with valid data in lanes 0..3. + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + // Each chroma sample covers 2 Y lanes (4:2:2): duplicate via + // `_mm_unpacklo_epi16` so lanes 0..7 of `r_dup` align with + // Y0..Y7. Lane order: [c0, c0, c1, c1, c2, c2, c3, c3]. + let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); + let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); + let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); + + // Y scale: `(Y - y_off) * y_scale + RND >> 15` → i16x8. + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // u8 narrow with saturation. `_mm_packus_epi16(lo, hi)` emits + // 16 u8 lanes from 16 i16 lanes; we feed `lo == hi` (or zero + // for hi) so the low 8 bytes of the result hold the saturated + // u8 of the input i16x8. Only the first 8 bytes per channel + // matter. + let zero = _mm_setzero_si128(); + let r_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, r_dup), zero); + let g_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, g_dup), zero); + let b_u8 = _mm_packus_epi16(_mm_adds_epi16(y_scaled, b_dup), zero); + + // 8-pixel partial store: SSE4.1's `write_rgb_16` / `write_rgba_16` + // emit 16-pixel output (48 / 64 bytes), so for the 8-px-iter + // body we use the v210-style stack-buffer + scalar interleave + // pattern. (8 px × 3 = 24 bytes RGB, 8 px × 4 = 32 bytes RGBA.) 
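The `_mm_unpacklo_epi16(c, c)` duplication referenced above is the 4:2:2 step: each chroma result is shared by two horizontally adjacent Y samples. On arrays:

    fn dup_chroma_422(c: [i16; 4]) -> [i16; 8] {
        [c[0], c[0], c[1], c[1], c[2], c[2], c[3], c[3]]
    }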
+ let mut r_tmp = [0u8; 16]; + let mut g_tmp = [0u8; 16]; + let mut b_tmp = [0u8; 16]; + _mm_storeu_si128(r_tmp.as_mut_ptr().cast(), r_u8); + _mm_storeu_si128(g_tmp.as_mut_ptr().cast(), g_u8); + _mm_storeu_si128(b_tmp.as_mut_ptr().cast(), b_u8); + + if ALPHA { + let dst = &mut out[x * 4..x * 4 + 8 * 4]; + for i in 0..8 { + dst[i * 4] = r_tmp[i]; + dst[i * 4 + 1] = g_tmp[i]; + dst[i * 4 + 2] = b_tmp[i]; + dst[i * 4 + 3] = 0xFF; + } + } else { + let dst = &mut out[x * 3..x * 3 + 8 * 3]; + for i in 0..8 { + dst[i * 3] = r_tmp[i]; + dst[i * 3 + 1] = g_tmp[i]; + dst[i * 3 + 2] = b_tmp[i]; + } } - } - x += 8; - } + x += 8; + } + } // end if !BE - // Scalar tail — remaining < 8 pixels (always even per 4:2:2). + // Scalar tail — remaining < 8 pixels (always even per 4:2:2), + // or full-row fallback when BE=true. if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_or_rgba_row::( + scalar::y2xx_n_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -288,7 +295,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -314,72 +325,74 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row(u_i16)); - let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); - let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); - - let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); - let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); - let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); - let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); - - let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); - - let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); - let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); - let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); - - let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); - - // Native-depth output: clamp to [0, (1 << BITS) - 1]. - // `_mm_adds_epi16` saturates at i16 bounds (no-op here since - // |sum| stays well inside i16 for BITS ≤ 12), then min/max - // clamps to the BITS range. 
- let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v); - let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v); - let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v); - - if ALPHA { - let alpha = _mm_set1_epi16(out_max); - write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4)); - } else { - write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3)); - } + if !BE { + let rnd_v = _mm_set1_epi32(RND); + let y_off_v = _mm_set1_epi16(y_off as i16); + let y_scale_v = _mm_set1_epi32(y_scale); + let c_scale_v = _mm_set1_epi32(c_scale); + let bias_v = _mm_set1_epi16(bias as i16); + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let max_v = _mm_set1_epi16(out_max); + let zero_v = _mm_set1_epi16(0); + let cru = _mm_set1_epi32(coeffs.r_u()); + let crv = _mm_set1_epi32(coeffs.r_v()); + let cgu = _mm_set1_epi32(coeffs.g_u()); + let cgv = _mm_set1_epi32(coeffs.g_v()); + let cbu = _mm_set1_epi32(coeffs.b_u()); + let cbv = _mm_set1_epi32(coeffs.b_v()); + + while x + 8 <= width { + let (y_vec, u_vec, v_vec) = unpack_y2xx_8px_sse41(packed.as_ptr().add(x * 2), shr_count); + + let y_i16 = y_vec; + let u_i16 = _mm_sub_epi16(u_vec, bias_v); + let v_i16 = _mm_sub_epi16(v_vec, bias_v); + + let u_lo_i32 = _mm_cvtepi16_epi32(u_i16); + let u_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(u_i16)); + let v_lo_i32 = _mm_cvtepi16_epi32(v_i16); + let v_hi_i32 = _mm_cvtepi16_epi32(_mm_srli_si128::<8>(v_i16)); + + let u_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_lo_i32, c_scale_v), rnd_v)); + let u_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(u_hi_i32, c_scale_v), rnd_v)); + let v_d_lo = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_lo_i32, c_scale_v), rnd_v)); + let v_d_hi = q15_shift(_mm_add_epi32(_mm_mullo_epi32(v_hi_i32, c_scale_v), rnd_v)); + + let r_chroma = chroma_i16x8(cru, crv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let g_chroma = chroma_i16x8(cgu, cgv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + let b_chroma = chroma_i16x8(cbu, cbv, u_d_lo, v_d_lo, u_d_hi, v_d_hi, rnd_v); + + let r_dup = _mm_unpacklo_epi16(r_chroma, r_chroma); + let g_dup = _mm_unpacklo_epi16(g_chroma, g_chroma); + let b_dup = _mm_unpacklo_epi16(b_chroma, b_chroma); + + let y_scaled = scale_y(y_i16, y_off_v, y_scale_v, rnd_v); + + // Native-depth output: clamp to [0, (1 << BITS) - 1]. + // `_mm_adds_epi16` saturates at i16 bounds (no-op here since + // |sum| stays well inside i16 for BITS ≤ 12), then min/max + // clamps to the BITS range. + let r = clamp_u16_max(_mm_adds_epi16(y_scaled, r_dup), zero_v, max_v); + let g = clamp_u16_max(_mm_adds_epi16(y_scaled, g_dup), zero_v, max_v); + let b = clamp_u16_max(_mm_adds_epi16(y_scaled, b_dup), zero_v, max_v); + + if ALPHA { + let alpha = _mm_set1_epi16(out_max); + write_rgba_u16_8(r, g, b, alpha, out.as_mut_ptr().add(x * 4)); + } else { + write_rgb_u16_8(r, g, b, out.as_mut_ptr().add(x * 3)); + } - x += 8; - } + x += 8; + } + } // end if !BE if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( + scalar::y2xx_n_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -405,7 +418,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row= width`. 
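A scalar model of the native-depth clamp in the hunk above, assuming `clamp_u16_max` is the obvious min/max pair over `zero_v` and `max_v`:

    fn clamp_to_bits<const BITS: u32>(sum: i16) -> u16 {
        let max = ((1u32 << BITS) - 1) as i16; // 1023 (10-bit) / 4095 (12-bit)
        sum.clamp(0, max) as u16
    }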
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_luma_row( +pub(crate) unsafe fn y2xx_n_to_luma_row( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -422,39 +435,41 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { - // Y permute mask: pick even u16 lanes (low byte at [0], high byte - // at [1]) into the low 8 bytes; high 8 bytes zeroed. - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _] - let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _] - let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned - - // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for - // any BITS ∈ {10, 12} — same single-shift simplification used - // by NEON's `vshrn_n_u16::<8>`. - // `_mm_srli_epi16::<8>` has a literal const count, so it works - // here without the runtime-count helper. - let y_shr = _mm_srli_epi16::<8>(y_vec); - // Pack 8 i16 lanes to u8 — only low 8 bytes used. - let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); - // Store low 8 bytes via stack buffer + copy_from_slice. - let mut tmp = [0u8; 16]; - _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8); - luma_out[x..x + 8].copy_from_slice(&tmp[..8]); - - x += 8; + if !BE { + // Y permute mask: pick even u16 lanes (low byte at [0], high byte + // at [1]) into the low 8 bytes; high 8 bytes zeroed. + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = _mm_shuffle_epi8(lo, y_idx); // [Y0..Y3, _, _, _, _] + let y_hi = _mm_shuffle_epi8(hi, y_idx); // [Y4..Y7, _, _, _, _] + let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); // [Y0..Y7] MSB-aligned + + // `>> (16 - BITS)` then `>> (BITS - 8)` collapses to `>> 8` for + // any BITS ∈ {10, 12} — same single-shift simplification used + // by NEON's `vshrn_n_u16::<8>`. + // `_mm_srli_epi16::<8>` has a literal const count, so it works + // here without the runtime-count helper. + let y_shr = _mm_srli_epi16::<8>(y_vec); + // Pack 8 i16 lanes to u8 — only low 8 bytes used. + let y_u8 = _mm_packus_epi16(y_shr, _mm_setzero_si128()); + // Store low 8 bytes via stack buffer + copy_from_slice. + let mut tmp = [0u8; 16]; + _mm_storeu_si128(tmp.as_mut_ptr().cast(), y_u8); + luma_out[x..x + 8].copy_from_slice(&tmp[..8]); + + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_row::(tail_packed, tail_out, tail_w); } } } @@ -471,7 +486,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( /// 4. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn y2xx_n_to_luma_u16_row( +pub(crate) unsafe fn y2xx_n_to_luma_u16_row( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -488,28 +503,30 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. 
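The two-shift collapse quoted in the comments above is a plain shift identity, checkable for the depths this kernel is instantiated at:

    fn main() {
        for bits in [10u32, 12] {
            for y in [0u16, 1, 0x1234, 0xFFFF] {
                // `>> (16 - bits)` then `>> (bits - 8)` equals `>> 8`.
                assert_eq!((y >> (16 - bits)) >> (bits - 8), y >> 8);
            }
        }
    }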
unsafe { - let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); - let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); - let mut x = 0usize; - while x + 8 <= width { - let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); - let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); - let y_lo = _mm_shuffle_epi8(lo, y_idx); - let y_hi = _mm_shuffle_epi8(hi, y_idx); - let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); - // Right-shift by `(16 - BITS)` to bring MSB-aligned samples - // into low-bit-packed form for the native-depth u16 output. - let y_low = _mm_srl_epi16(y_vec, shr_count); - _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low); - x += 8; + if !BE { + let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); + let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); + + while x + 8 <= width { + let lo = _mm_loadu_si128(packed.as_ptr().add(x * 2).cast()); + let hi = _mm_loadu_si128(packed.as_ptr().add(x * 2 + 8).cast()); + let y_lo = _mm_shuffle_epi8(lo, y_idx); + let y_hi = _mm_shuffle_epi8(hi, y_idx); + let y_vec = _mm_unpacklo_epi64(y_lo, y_hi); + // Right-shift by `(16 - BITS)` to bring MSB-aligned samples + // into low-bit-packed form for the native-depth u16 output. + let y_low = _mm_srl_epi16(y_vec, shr_count); + _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_low); + x += 8; + } } if x < width { let tail_packed = &packed[x * 2..width * 2]; let tail_out = &mut luma_out[x..width]; let tail_w = width - x; - scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); + scalar::y2xx_n_to_luma_u16_row::(tail_packed, tail_out, tail_w); } } } diff --git a/src/row/dispatch/v210.rs b/src/row/dispatch/v210.rs index 2760c4b2..7d1c14b1 100644 --- a/src/row/dispatch/v210.rs +++ b/src/row/dispatch/v210.rs @@ -7,8 +7,8 @@ //! block; `use_simd = false` forces scalar. //! //! The per-format SIMD kernels are const-generic on `ALPHA` -//! (`v210_to_rgb_or_rgba_row::` / -//! `v210_to_rgb_u16_or_rgba_u16_row::`) — the public +//! (`v210_to_rgb_or_rgba_row::` / +//! `v210_to_rgb_u16_or_rgba_u16_row::`) — the public //! dispatchers split them into RGB vs. RGBA entries by hard-wiring //! `ALPHA = false` / `true`. @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of v210 to packed RGB (u8). See /// [`scalar::v210_to_rgb_or_rgba_row`] for byte layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (32-bit words stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn v210_to_rgb_row( packed: &[u8], @@ -40,6 +41,7 @@ pub fn v210_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn v210_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. 
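The `dispatch_be!` helper defined above exists because `BE` must be a compile-time const while `big_endian` arrives as a runtime flag; each dispatcher therefore branches once into one of two monomorphized kernels. A standalone model of the pattern (`u16::from_be` stands in for whatever byte-order handling the real scalar kernels do; that code is outside this hunk):

    fn kernel<const BE: bool>(sample: u16) -> u16 {
        if BE { u16::from_be(sample) } else { sample }
    }

    fn dispatch(sample: u16, big_endian: bool) -> u16 {
        // Same shape as each `dispatch_be!` expansion: runtime bool to
        // compile-time const, picked once per row.
        if big_endian { kernel::<true>(sample) } else { kernel::<false>(sample) }
    }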
- unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn v210_to_rgb_row( } } - scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::v210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of v210 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn v210_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn v210_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn v210_to_rgba_row( } } - scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::v210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of v210 to packed `u16` RGB at native 10-bit @@ -167,6 +218,7 @@ pub fn v210_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn v210_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn v210_to_rgb_u16_row( } } - scalar::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of v210 to packed `u16` RGBA at native 10-bit @@ -231,6 +311,7 @@ pub fn v210_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn v210_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn v210_to_rgba_u16_row( } } - scalar::v210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::v210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed v210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn v210_to_luma_row( + packed: &[u8], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "v210 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::neon::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::v210_to_luma_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::v210_to_luma_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::v210_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::v210_to_luma_row::(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_si } } - scalar::v210_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::v210_to_luma_row::(packed, luma_out, width), + scalar::v210_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed v210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn v210_to_luma_u16_row( + packed: &[u8], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "v210 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::neon::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::v210_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::v210_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::v210_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, u } } - scalar::v210_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::v210_to_luma_u16_row::(packed, luma_out, width), + scalar::v210_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -435,7 +604,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 6 * 3]; - v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false); + v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -444,7 +613,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 6 * 4]; - v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false); + v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -452,7 +621,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 6 * 3]; - v210_to_rgb_u16_row(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true, false); + v210_to_rgb_u16_row( + &word, + &mut rgb_u16, + 6, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -461,21 +638,29 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 6 * 4]; - v210_to_rgba_u16_row(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true, false); + v210_to_rgba_u16_row( + &word, + &mut rgba_u16, + 6, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 6]; - v210_to_luma_row(&word, &mut luma, 6, false); + v210_to_luma_row(&word, &mut luma, 6, false, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. let mut luma_u16 = [0u16; 6]; - v210_to_luma_u16_row(&word, &mut luma_u16, 6, false); + v210_to_luma_u16_row(&word, &mut luma_u16, 6, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y210.rs b/src/row/dispatch/y210.rs index e9ab9eca..97bd0766 100644 --- a/src/row/dispatch/y210.rs +++ b/src/row/dispatch/y210.rs @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of Y210 to packed RGB (u8). See /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn y210_to_rgb_row( packed: &[u16], @@ -40,6 +41,7 @@ pub fn y210_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn y210_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. 
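Call-site shape for the updated public dispatchers, mirroring the v210 tests earlier in this hunk: the two trailing bools are `use_simd` and `big_endian`, in that order (a sketch; `packed` is a caller-supplied row):

    fn decode_row(packed: &[u8]) -> Vec<u8> {
        let mut rgb = vec![0u8; 6 * 3];
        // (packed, out, width, matrix, full_range, use_simd, big_endian)
        v210_to_rgb_row(packed, &mut rgb, 6, ColorMatrix::Bt709, true, true, false);
        rgb
    }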
- unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn y210_to_rgb_row( } } - scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y210_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y210 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn y210_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn y210_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. 
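Why `BITS = 10` works for Y210 throughout: each 10-bit sample sits MSB-aligned in its u16 container, so `>> (16 - 10)` yields the native-depth value and `>> 8` the 8-bit one, consistent with the shift logic in the kernels above:

    fn y210_native(sample: u16) -> u16 {
        sample >> 6 // 0..=1023
    }

    fn y210_luma_u8(sample: u16) -> u8 {
        (sample >> 8) as u8
    }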
- unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn y210_to_rgba_row( } } - scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y210_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y210 to packed `u16` RGB at native 10-bit @@ -167,6 +218,7 @@ pub fn y210_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn y210_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn y210_to_rgb_u16_row( } } - scalar::y210_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y210 to packed `u16` RGBA at native 10-bit @@ -231,6 +311,7 @@ pub fn y210_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn y210_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn y210_to_rgba_u16_row( } } - scalar::y210_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y210_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y210_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y210 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<10, true>(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y210_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y210_to_luma_row::(packed, luma_out, width), + scalar::y210_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y210_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y210 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<10, true>(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y210_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y210_to_luma_u16_row::(packed, luma_out, width), + scalar::y210_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -433,7 +602,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -442,7 +611,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -450,7 +619,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y210_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y210_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -459,21 +636,29 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 8 * 4]; - y210_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y210_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 8]; - y210_to_luma_row(&buf, &mut luma, 8, false); + y210_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. let mut luma_u16 = [0u16; 8]; - y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y212.rs b/src/row/dispatch/y212.rs index aa253721..2245c50e 100644 --- a/src/row/dispatch/y212.rs +++ b/src/row/dispatch/y212.rs @@ -31,7 +31,8 @@ use crate::{ /// Converts one row of Y212 to packed RGB (u8). See /// [`scalar::y2xx_n_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). 
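+///
+/// A minimal sketch of the endianness contract (sample values are
+/// illustrative only and assume a little-endian host): byte-swapping
+/// every `u16` sample while flipping the flag must yield identical
+/// output.
+///
+/// ```ignore
+/// // Hypothetical 12-bit gray pixel pair: Y0 U Y1 V, all at midpoint 2048.
+/// let le: [u16; 4] = [2048; 4];
+/// let be: Vec<u16> = le.iter().map(|s| s.swap_bytes()).collect();
+/// let (mut a, mut b) = ([0u8; 2 * 3], [0u8; 2 * 3]);
+/// y212_to_rgb_row(&le, &mut a, 2, ColorMatrix::Bt709, true, false, false);
+/// y212_to_rgb_row(&be, &mut b, 2, ColorMatrix::Bt709, true, false, true);
+/// assert_eq!(a, b);
+/// ```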
#[cfg_attr(not(tarpaulin), inline(always))] pub fn y212_to_rgb_row( packed: &[u16], @@ -40,6 +41,7 @@ pub fn y212_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -54,36 +56,57 @@ pub fn y212_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -91,7 +114,10 @@ pub fn y212_to_rgb_row( } } - scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y212_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y212 to packed RGBA (u8) with `α = 0xFF`. @@ -103,6 +129,7 @@ pub fn y212_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -117,36 +144,57 @@ pub fn y212_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_or_rgba_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -154,7 +202,10 @@ pub fn y212_to_rgba_row( } } - scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y212_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y212 to packed `u16` RGB at native 12-bit @@ -167,6 +218,7 @@ pub fn y212_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -181,36 +233,57 @@ pub fn y212_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false>(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, false>(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, false, true>(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -218,7 +291,14 @@ pub fn y212_to_rgb_u16_row( } } - scalar::y212_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y212 to packed `u16` RGBA at native 12-bit @@ -231,6 +311,7 @@ pub fn y212_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -245,36 +326,57 @@ pub fn y212_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true>(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, false>(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, true, true>(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -282,13 +384,26 @@ pub fn y212_to_rgba_u16_row( } } - scalar::y212_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y212_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y212 buffer. /// Y values are downshifted from 12-bit to 8-bit via `>> 4`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y212_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y212 requires even width (4:2:2 chroma pair)" @@ -299,36 +414,57 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_row::<12, true>(packed, luma_out, width); } + ); return; } }, @@ -336,14 +472,23 @@ pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y212_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y212_to_luma_row::(packed, luma_out, width), + scalar::y212_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y212 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in /// its low 12 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y212_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y212 requires even width (4:2:2 chroma pair)" @@ -354,36 +499,57 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::neon::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. 
- unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, false>(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y2xx_n_to_luma_u16_row::<12, true>(packed, luma_out, width); } + ); return; } }, @@ -391,7 +557,10 @@ pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y212_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y212_to_luma_u16_row::(packed, luma_out, width), + scalar::y212_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -433,7 +602,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -442,7 +611,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -450,7 +619,15 @@ mod tests { // u16 RGB at native 12-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y212_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y212_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(2048) <= 2); assert_eq!(px[0], px[1]); @@ -459,21 +636,29 @@ mod tests { // u16 RGBA — alpha = 4095. let mut rgba_u16 = [0u16; 8 * 4]; - y212_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y212_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 4095); } // u8 luma — Y=2048 → 128 after `>> 4`. let mut luma = [0u8; 8]; - y212_to_luma_row(&buf, &mut luma, 8, false); + y212_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (2048u16 >> 4) as u8); } // u16 luma — low-packed 12-bit Y. 
let mut luma_u16 = [0u16; 8]; - y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 2048); } diff --git a/src/row/dispatch/y216.rs b/src/row/dispatch/y216.rs index 9f0fc6de..541022c7 100644 --- a/src/row/dispatch/y216.rs +++ b/src/row/dispatch/y216.rs @@ -30,7 +30,8 @@ use crate::{ /// Converts one row of Y216 to packed RGB (u8). See /// [`scalar::y216_to_rgb_or_rgba_row`] for sample layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `big_endian = true` selects +/// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] pub fn y216_to_rgb_row( packed: &[u16], @@ -39,6 +40,7 @@ pub fn y216_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -53,36 +55,57 @@ pub fn y216_to_rgb_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -90,7 +113,10 @@ pub fn y216_to_rgb_row( } } - scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range), + scalar::y216_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range) + ); } /// Converts one row of Y216 to packed RGBA (u8) with `α = 0xFF`. @@ -102,6 +128,7 @@ pub fn y216_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -116,36 +143,57 @@ pub fn y216_to_rgba_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -153,7 +201,10 @@ pub fn y216_to_rgba_row( } } - scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range), + scalar::y216_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range) + ); } /// Converts one row of Y216 to packed `u16` RGB at native 16-bit @@ -166,6 +217,7 @@ pub fn y216_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -180,36 +232,57 @@ pub fn y216_to_rgb_u16_row( "rgb_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -217,7 +290,14 @@ pub fn y216_to_rgb_u16_row( } } - scalar::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ), + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range + ) + ); } /// Converts one row of Y216 to packed `u16` RGBA at native 16-bit @@ -230,6 +310,7 @@ pub fn y216_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert!( width.is_multiple_of(2), @@ -244,36 +325,57 @@ pub fn y216_to_rgba_u16_row( "rgba_out row too short" ); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -281,13 +383,26 @@ pub fn y216_to_rgba_u16_row( } } - scalar::y216_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ), + scalar::y216_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range + ) + ); } /// Extracts one row of 8-bit luma from a packed Y216 buffer. /// Y values are downshifted from 16-bit to 8-bit via `>> 8`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn y216_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y216 requires even width (4:2:2 chroma pair)" @@ -298,36 +413,57 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::neon::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y216_to_luma_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y216_to_luma_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_luma_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_luma_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y216_to_luma_row::(packed, luma_out, width); } + ); return; } }, @@ -335,13 +471,22 @@ pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::y216_to_luma_row(packed, luma_out, width); + dispatch_be!( + scalar::y216_to_luma_row::(packed, luma_out, width), + scalar::y216_to_luma_row::(packed, luma_out, width) + ); } /// Extracts one row of native-depth `u16` luma from a packed Y216 /// buffer (full-range: each `u16` carries the 16-bit Y value directly). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn y216_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + big_endian: bool, +) { assert!( width.is_multiple_of(2), "Y216 requires even width (4:2:2 chroma pair)" @@ -352,36 +497,57 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, ); assert!(luma_out.len() >= width, "luma_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::neon::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::neon::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx512::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx512::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_avx2::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_avx2::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::x86_sse41::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::x86_sse41::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::y216_to_luma_u16_row(packed, luma_out, width); } + dispatch_be!( + unsafe { arch::wasm_simd128::y216_to_luma_u16_row::(packed, luma_out, width); }, + unsafe { arch::wasm_simd128::y216_to_luma_u16_row::(packed, luma_out, width); } + ); return; } }, @@ -389,7 +555,10 @@ pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::y216_to_luma_u16_row(packed, luma_out, width); + dispatch_be!( + scalar::y216_to_luma_u16_row::(packed, luma_out, width), + scalar::y216_to_luma_u16_row::(packed, luma_out, width) + ); } #[cfg(all(test, feature = "std"))] @@ -431,7 +600,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -440,7 +609,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -448,7 +617,15 @@ mod tests { // u16 RGB at native 16-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - y216_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + y216_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(32768) <= 4); assert_eq!(px[0], px[1]); @@ -457,21 +634,29 @@ mod tests { // u16 RGBA — alpha = 0xFFFF. let mut rgba_u16 = [0u16; 8 * 4]; - y216_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + y216_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 0xFFFF); } // u8 luma — Y=32768 → 128 after `>> 8`. let mut luma = [0u8; 8]; - y216_to_luma_row(&buf, &mut luma, 8, false); + y216_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (32768u16 >> 8) as u8); } // u16 luma — full 16-bit Y value. let mut luma_u16 = [0u16; 8]; - y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 32768); } @@ -483,7 +668,7 @@ mod tests { // packed buffer has only 2 elements for width=4 (needs 8). let packed = [0u16; 2]; let mut rgb = [0u8; 4 * 3]; - y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -492,7 +677,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). 
let packed = [0u16; 8]; let mut rgb = [0u8; 2]; - y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -500,7 +685,7 @@ mod tests { fn y216_dispatcher_rejects_odd_width() { let packed = [0u16; 6]; let mut rgb = [0u8; 9]; - y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false); + y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false, false); } #[test] @@ -521,6 +706,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/mod.rs b/src/row/mod.rs index b2502de8..0a476276 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -944,7 +944,15 @@ mod overflow_tests { let candidate = ((usize::MAX / 16) + 1) * 6; let p: [u8; 0] = []; let mut rgb: [u8; 0] = []; - v210_to_rgb_row(&p, &mut rgb, candidate, ColorMatrix::Bt601, true, false); + v210_to_rgb_row( + &p, + &mut rgb, + candidate, + ColorMatrix::Bt601, + true, + false, + false, + ); } // ---- Y2xx dispatcher — `width × 2` overflow ---- @@ -974,6 +982,7 @@ mod overflow_tests { ColorMatrix::Bt601, true, false, + false, ); } } diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs index b0e390ee..46326468 100644 --- a/src/row/scalar/mod.rs +++ b/src/row/scalar/mod.rs @@ -123,6 +123,42 @@ pub(crate) use yuv_planar_high_bit::*; // ---- Shared scalar helpers (used across all conversion families) ------- +/// Reads one `u16` from the byte address `ptr` in the endianness +/// indicated by `BE`. `BE = false` → little-endian (native v210/Y2xx +/// on-wire format); `BE = true` → big-endian. The unused branch is +/// eliminated by the compiler when the caller is monomorphized. +/// +/// # Safety +/// +/// `ptr` must point to at least 2 readable bytes. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) unsafe fn load_endian_u16(ptr: *const u8) -> u16 { + let bytes = unsafe { [*ptr, *ptr.add(1)] }; + if BE { + u16::from_be_bytes(bytes) + } else { + u16::from_le_bytes(bytes) + } +} + +/// Reads one `u32` from the byte address `ptr` in the endianness +/// indicated by `BE`. `BE = false` → little-endian; `BE = true` → +/// big-endian. The unused branch is eliminated by the compiler when +/// the caller is monomorphized. +/// +/// # Safety +/// +/// `ptr` must point to at least 4 readable bytes. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) unsafe fn load_endian_u32(ptr: *const u8) -> u32 { + let bytes = unsafe { [*ptr, *ptr.add(1), *ptr.add(2), *ptr.add(3)] }; + if BE { + u32::from_be_bytes(bytes) + } else { + u32::from_le_bytes(bytes) + } +} + #[cfg_attr(not(tarpaulin), inline(always))] pub(super) fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 diff --git a/src/row/scalar/v210.rs b/src/row/scalar/v210.rs index 00a4e029..1b9db248 100644 --- a/src/row/scalar/v210.rs +++ b/src/row/scalar/v210.rs @@ -10,6 +10,15 @@ //! word 2: `[Cr1, Y3, Cb2]` //! word 3: `[Y4, Cr2, Y5]` //! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each 32-bit word in the packed stream is +//! stored in big-endian byte order. `load_endian_u32::` handles +//! the conditional byte-swap at each u32 load site inside +//! `unpack_v210_word`; the `BE = false` path is identical to the +//! previous `u32::from_le_bytes` decode. The unused branch is +//! eliminated at monomorphization. +//! //! ## Partial-word support //! //! Real captures (e.g. 
720p = 1280 wide) commonly end on a partial @@ -32,14 +41,16 @@ use super::*; /// Extracts 6 Y + 3 U + 3 V 10-bit samples from one 16-byte v210 /// word. Output samples are 10-bit values in the low 10 bits of -/// each `u16`. +/// each `u16`. `BE = true` reads each 32-bit word in big-endian +/// byte order. #[cfg_attr(not(tarpaulin), inline(always))] -fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { +fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { debug_assert_eq!(word.len(), 16); - let w0 = u32::from_le_bytes([word[0], word[1], word[2], word[3]]); - let w1 = u32::from_le_bytes([word[4], word[5], word[6], word[7]]); - let w2 = u32::from_le_bytes([word[8], word[9], word[10], word[11]]); - let w3 = u32::from_le_bytes([word[12], word[13], word[14], word[15]]); + // SAFETY: word has exactly 16 bytes (checked above); each offset is ≤ 12. + let w0 = unsafe { load_endian_u32::(word.as_ptr()) }; + let w1 = unsafe { load_endian_u32::(word.as_ptr().add(4)) }; + let w2 = unsafe { load_endian_u32::(word.as_ptr().add(8)) }; + let w3 = unsafe { load_endian_u32::(word.as_ptr().add(12)) }; // Word 0: [Cb0, Y0, Cr0] let cb0 = (w0 & 0x3FF) as u16; @@ -70,14 +81,14 @@ fn unpack_v210_word(word: &[u8]) -> ([u16; 6], [u16; 3], [u16; 3]) { /// /// Supports any **even** `width`: complete 6-px words run the full /// loop; a final partial word emits 2 or 4 pixels from its valid -/// chroma-pair prefix. +/// chroma-pair prefix. `BE = true` selects big-endian u32 word decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_rgb_or_rgba_row( +pub(crate) fn v210_to_rgb_or_rgba_row( packed: &[u8], out: &mut [u8], width: usize, @@ -101,7 +112,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row( for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); // 6 pixels per word; each chroma pair (U[i], V[i]) covers // Y[2i] and Y[2i+1]. @@ -135,7 +146,7 @@ pub(crate) fn v210_to_rgb_or_rgba_row( // pairs are valid (1 pair for 2 px; 2 pairs for 4 px). let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::(word); let pairs = tail_pixels / 2; for i in 0..pairs { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -172,14 +183,15 @@ pub(crate) fn v210_to_rgb_or_rgba_row( /// `(1 << 10) - 1 = 1023` (opaque maximum at 10-bit). /// /// Supports any **even** `width`: see [`v210_to_rgb_or_rgba_row`] -/// for partial-word semantics. +/// for partial-word semantics. `BE = true` selects big-endian u32 word +/// decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). 
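+///
+/// Byte-level sketch of the two encodings (illustrative; it mirrors the
+/// `pack_v210_word_be` test helper below): a BE v210 word is the LE word
+/// with each 4-byte group reversed, so both decode to the same 10-bit
+/// samples under the matching `BE` flag.
+///
+/// ```ignore
+/// // One v210 dword holding [Cb, Y, Cr] = [512, 512, 512].
+/// let w: u32 = 512 | (512 << 10) | (512 << 20);
+/// let mut rev = w.to_be_bytes();
+/// rev.reverse();
+/// assert_eq!(w.to_le_bytes(), rev); // LE bytes = reversed BE bytes
+/// ```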
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( +pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>( packed: &[u8], out: &mut [u16], width: usize, @@ -204,7 +216,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::<BE>(word); for i in 0..3 { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -232,7 +244,7 @@ pub(crate) fn v210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, us, vs) = unpack_v210_word(word); + let (ys, us, vs) = unpack_v210_word::<BE>(word); let pairs = tail_pixels / 2; for i in 0..pairs { let u_d = q15_scale(us[i] as i32 - bias, c_scale); @@ -262,13 +274,14 @@ /// Scalar v210 → 8-bit luma. Y values are downshifted from 10-bit /// to 8-bit via `>> 2`. Bypasses the YUV → RGB pipeline entirely. +/// `BE = true` selects big-endian u32 word decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) { +pub(crate) fn v210_to_luma_row<const BE: bool>(packed: &[u8], luma_out: &mut [u8], width: usize) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); debug_assert!(packed.len() >= total_words * 16, "packed row too short"); @@ -279,7 +292,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::<BE>(word); for k in 0..6 { luma_out[w * 6 + k] = (ys[k] >> 2) as u8; } @@ -287,7 +300,7 @@ pub(crate) fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize) if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::<BE>(word); for k in 0..tail_pixels { luma_out[w * 6 + k] = (ys[k] >> 2) as u8; } @@ -296,14 +309,19 @@ /// Scalar v210 → native-depth `u16` luma (low-bit-packed). Each /// output `u16` carries the source's 10-bit Y value in its low 10 -/// bits (upper 6 bits zero). +/// bits (upper 6 bits zero). `BE = true` selects big-endian u32 word +/// decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= ceil(width / 6) * 16`. /// - `luma_out.len() >= width`.
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize) { +pub(crate) fn v210_to_luma_u16_row<const BE: bool>( + packed: &[u8], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(width.is_multiple_of(2), "v210 requires even width"); let total_words = width.div_ceil(6); debug_assert!(packed.len() >= total_words * 16, "packed row too short"); @@ -314,13 +332,13 @@ pub(crate) fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: u for w in 0..full_words { let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::<BE>(word); luma_out[w * 6..w * 6 + 6].copy_from_slice(&ys); } if tail_pixels > 0 { let w = full_words; let word = &packed[w * 16..w * 16 + 16]; - let (ys, _, _) = unpack_v210_word(word); + let (ys, _, _) = unpack_v210_word::<BE>(word); luma_out[w * 6..w * 6 + tail_pixels].copy_from_slice(&ys[..tail_pixels]); } } @@ -358,12 +376,34 @@ mod tests { out } + /// Pack a v210 word using big-endian u32 encoding (each 32-bit word stored BE). + fn pack_v210_word_be(samples: [u16; 12]) -> [u8; 16] { + let mut out = [0u8; 16]; + let w0 = (samples[0] as u32 & 0x3FF) + | ((samples[1] as u32 & 0x3FF) << 10) + | ((samples[2] as u32 & 0x3FF) << 20); + let w1 = (samples[3] as u32 & 0x3FF) + | ((samples[4] as u32 & 0x3FF) << 10) + | ((samples[5] as u32 & 0x3FF) << 20); + let w2 = (samples[6] as u32 & 0x3FF) + | ((samples[7] as u32 & 0x3FF) << 10) + | ((samples[8] as u32 & 0x3FF) << 20); + let w3 = (samples[9] as u32 & 0x3FF) + | ((samples[10] as u32 & 0x3FF) << 10) + | ((samples[11] as u32 & 0x3FF) << 20); + out[0..4].copy_from_slice(&w0.to_be_bytes()); + out[4..8].copy_from_slice(&w1.to_be_bytes()); + out[8..12].copy_from_slice(&w2.to_be_bytes()); + out[12..16].copy_from_slice(&w3.to_be_bytes()); + out + } + #[test] fn scalar_v210_to_rgb_gray_is_gray() { // Full-range gray: Y=512, U=V=512 (10-bit center). let word = pack_v210_word([512; 12]); let mut rgb = [0u8; 6 * 3]; - v210_to_rgb_or_rgba_row::<false>(&word, &mut rgb, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<false, false>(&word, &mut rgb, 6, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -375,7 +415,7 @@ mod tests { fn scalar_v210_to_rgba_gray_is_gray_with_opaque_alpha() { let word = pack_v210_word([512; 12]); let mut rgba = [0u8; 6 * 4]; - v210_to_rgb_or_rgba_row::<true>(&word, &mut rgba, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<true, false>(&word, &mut rgba, 6, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -387,7 +427,13 @@ mod tests { // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023). let word = pack_v210_word([512; 12]); let mut rgb_u16 = [0u16; 6 * 3]; - v210_to_rgb_u16_or_rgba_u16_row::<false>(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true); + v210_to_rgb_u16_or_rgba_u16_row::<false, false>( + &word, + &mut rgb_u16, + 6, + ColorMatrix::Bt709, + true, + ); for px in rgb_u16.chunks(3) { // Gray luma at 512 / full-range produces RGB ~512 in 10-bit.
assert!(px[0].abs_diff(512) <= 2); @@ -400,7 +446,13 @@ mod tests { fn scalar_v210_to_rgba_u16_alpha_is_max() { let word = pack_v210_word([512; 12]); let mut rgba_u16 = [0u16; 6 * 4]; - v210_to_rgb_u16_or_rgba_u16_row::<true>(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true); + v210_to_rgb_u16_or_rgba_u16_row::<true, false>( + &word, + &mut rgba_u16, + 6, + ColorMatrix::Bt709, + true, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1"); } @@ -413,7 +465,7 @@ mod tests { ]; let word = pack_v210_word(samples); let mut luma = [0u8; 6]; - v210_to_luma_row(&word, &mut luma, 6); + v210_to_luma_row::<false>(&word, &mut luma, 6); // Y values: 200, 300, 400, 500, 600, 700 → 10-bit, downshift >> 2. assert_eq!(luma[0], (200u16 >> 2) as u8); assert_eq!(luma[1], (300u16 >> 2) as u8); @@ -428,7 +480,7 @@ mod tests { let samples = [100, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700]; let word = pack_v210_word(samples); let mut luma = [0u16; 6]; - v210_to_luma_u16_row(&word, &mut luma, 6); + v210_to_luma_u16_row::<false>(&word, &mut luma, 6); assert_eq!(luma[0], 200); assert_eq!(luma[1], 300); assert_eq!(luma[2], 400); @@ -445,7 +497,7 @@ mod tests { packed.extend_from_slice(&pack_v210_word(samples)); packed.extend_from_slice(&pack_v210_word(samples)); let mut rgb = std::vec![0u8; 12 * 3]; - v210_to_rgb_or_rgba_row::<false>(&packed, &mut rgb, 12, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<false, false>(&packed, &mut rgb, 12, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); } @@ -468,19 +520,19 @@ mod tests { packed.extend_from_slice(&pack_v210_word([512; 12])); } let mut rgb = std::vec![0u8; width * 3]; - v210_to_rgb_or_rgba_row::<false>(&packed, &mut rgb, width, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<false, false>(&packed, &mut rgb, width, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1, "width={width}: gray RGB diverged"); assert_eq!(px[0], px[1]); } let mut rgba = std::vec![0u8; width * 4]; - v210_to_rgb_or_rgba_row::<true>(&packed, &mut rgba, width, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<true, false>(&packed, &mut rgba, width, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); } let mut rgb_u16 = std::vec![0u16; width * 3]; - v210_to_rgb_u16_or_rgba_u16_row::<false>( + v210_to_rgb_u16_or_rgba_u16_row::<false, false>( &packed, &mut rgb_u16, width, @@ -491,12 +543,12 @@ mod tests { assert!(px[0].abs_diff(512) <= 2); } let mut luma = std::vec![0u8; width]; - v210_to_luma_row(&packed, &mut luma, width); + v210_to_luma_row::<false>(&packed, &mut luma, width); for &y in &luma { assert_eq!(y, 128); } let mut luma_u16 = std::vec![0u16; width]; - v210_to_luma_u16_row(&packed, &mut luma_u16, width); + v210_to_luma_u16_row::<false>(&packed, &mut luma_u16, width); for &y in &luma_u16 { assert_eq!(y, 512); } @@ -558,8 +610,81 @@ mod tests { ]; let word = pack_v210_word(samples); let mut luma = [0u8; 2]; - v210_to_luma_row(&word, &mut luma, 2); + v210_to_luma_row::<false>(&word, &mut luma, 2); assert_eq!(luma[0], (600u16 >> 2) as u8); assert_eq!(luma[1], (700u16 >> 2) as u8); } + + // ---- BE parity tests ----------------------------------------------- + // + // For each output type: pack the same samples in BE word encoding, + // run the BE=true path, assert identical output to the LE=false path.
+ #[test] + fn scalar_v210_be_rgb_matches_le() { + let samples = [ + 100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_rgb = [0u8; 6 * 3]; + let mut be_rgb = [0u8; 6 * 3]; + v210_to_rgb_or_rgba_row::<false, false>(&le_word, &mut le_rgb, 6, ColorMatrix::Bt709, true); + v210_to_rgb_or_rgba_row::<false, true>(&be_word, &mut be_rgb, 6, ColorMatrix::Bt709, true); + assert_eq!(le_rgb, be_rgb, "BE rgb output must match LE"); + } + + #[test] + fn scalar_v210_be_rgb_u16_matches_le() { + let samples = [ + 100u16, 512, 400, 600, 200, 300, 500, 700, 150, 450, 350, 800, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_rgb = [0u16; 6 * 3]; + let mut be_rgb = [0u16; 6 * 3]; + v210_to_rgb_u16_or_rgba_u16_row::<false, false>( + &le_word, + &mut le_rgb, + 6, + ColorMatrix::Bt709, + true, + ); + v210_to_rgb_u16_or_rgba_u16_row::<false, true>( + &be_word, + &mut be_rgb, + 6, + ColorMatrix::Bt709, + true, + ); + assert_eq!(le_rgb, be_rgb, "BE rgb_u16 output must match LE"); + } + + #[test] + fn scalar_v210_be_luma_matches_le() { + let samples = [ + 100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_luma = [0u8; 6]; + let mut be_luma = [0u8; 6]; + v210_to_luma_row::<false>(&le_word, &mut le_luma, 6); + v210_to_luma_row::<true>(&be_word, &mut be_luma, 6); + assert_eq!(le_luma, be_luma, "BE luma output must match LE"); + } + + #[test] + fn scalar_v210_be_luma_u16_matches_le() { + let samples = [ + 100u16, 200, 100, 300, 100, 400, 100, 500, 100, 600, 100, 700, + ]; + let le_word = pack_v210_word(samples); + let be_word = pack_v210_word_be(samples); + let mut le_luma = [0u16; 6]; + let mut be_luma = [0u16; 6]; + v210_to_luma_u16_row::<false>(&le_word, &mut le_luma, 6); + v210_to_luma_u16_row::<true>(&be_word, &mut be_luma, 6); + assert_eq!(le_luma, be_luma, "BE luma_u16 output must match LE"); + } } diff --git a/src/row/scalar/y216.rs b/src/row/scalar/y216.rs index 088ec22e..291e8914 100644 --- a/src/row/scalar/y216.rs +++ b/src/row/scalar/y216.rs @@ -6,13 +6,21 @@ //! `src/row/scalar/yuv_planar_16bit.rs`'s i64 chroma scalar //! pattern but sourced from YUYV-shaped u16 quadruples rather //! than separate Y/U/V planes. +//! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each `u16` element in `packed` is stored in +//! big-endian byte order. `load_endian_u16::<BE>` handles the +//! conditional byte-swap at each sample site; the unused branch is +//! eliminated at monomorphization. use super::*; // ---- u8 RGB / RGBA output (i32 chroma — same as Y210/Y212) ------- +/// `BE = true` selects big-endian wire decoding for each u16 sample. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>( +pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u8], width: usize, @@ -29,13 +37,15 @@ pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>( let bias = chroma_bias::<16>(); let pairs = width / 2; + // SAFETY: bounds validated above; off4 + 6 < packed.len() * 2 for p < pairs. + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; + let off4 = p * 4 * 2; // No right-shift: BITS=16 means samples are already full-width.
- let y0 = q[0] as i32; - let u = q[1] as i32; - let y1 = q[2] as i32; - let v = q[3] as i32; + let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) } as i32; + let u = unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) } as i32; + let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) } as i32; + let v = unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) } as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -59,8 +69,9 @@ pub(crate) fn y216_to_rgb_or_rgba_row<const ALPHA: bool>( // ---- u16 RGB / RGBA native-depth output (i64 chroma) ---------------- +/// `BE = true` selects big-endian wire decoding for each u16 sample. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( +pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u16], width: usize, @@ -78,12 +89,13 @@ pub(crate) fn y216_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( let out_max: i32 = 0xFFFF; let pairs = width / 2; + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = q[0] as i32; - let u = q[1] as i32; - let y1 = q[2] as i32; - let v = q[3] as i32; + let off4 = p * 4 * 2; + let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) } as i32; + let u = unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) } as i32; + let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) } as i32; + let v = unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) } as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -107,31 +119,38 @@ // ---- Luma (u8) — `>> 8` ---------------------------------------------- +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn y216_to_luma_row<const BE: bool>(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let pairs = width / 2; + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - out[p * 2] = (q[0] >> 8) as u8; - out[p * 2 + 1] = (q[2] >> 8) as u8; + let off4 = p * 4 * 2; + let y0 = unsafe { load_endian_u16::<BE>(base.add(off4)) }; + let y1 = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }; + out[p * 2] = (y0 >> 8) as u8; + out[p * 2 + 1] = (y1 >> 8) as u8; } } // ---- Luma (u16, direct extract) --------------------------------------- +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn y216_to_luma_u16_row<const BE: bool>(packed: &[u16], out: &mut [u16], width: usize) { debug_assert!(width.is_multiple_of(2)); debug_assert!(packed.len() >= width * 2); debug_assert!(out.len() >= width); let pairs = width / 2; + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - out[p * 2] = q[0]; // direct extract — full 16 bits, no shift - out[p * 2 + 1] = q[2]; + let off4 = p * 4 * 2; + // Direct extract — full 16 bits, no shift; byte-swap if BE. + out[p * 2] = unsafe { load_endian_u16::<BE>(base.add(off4)) }; + out[p * 2 + 1] = unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }; } } @@ -147,6 +166,11 @@ mod tests { [4096, 32768, 32000, 32768, 0, 16384, 65535, 49152] } + /// Byte-swap every u16 to produce the BE-encoded form.
+ fn to_be_u16(le: &[u16]) -> std::vec::Vec<u16> { + le.iter().map(|&v| v.swap_bytes()).collect() + } + /// u8 RGB output — hand-derived expected values for Bt709 limited range. /// /// Pair 0 (neutral chroma, U=V=32768=bias → u_d=v_d=0 → chroma=0): @@ -159,7 +183,7 @@ fn y216_known_pattern_rgb() { let packed = test_input(); let mut out = [0u8; 4 * 3]; - y216_to_rgb_or_rgba_row::<false>(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::<false, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false); // Pixel 0: Y=4096 (limited-range black), neutral chroma → (0, 0, 0) assert_eq!(&out[0..3], &[0, 0, 0], "pixel 0 (Y=4096, neutral chroma)"); @@ -184,7 +208,7 @@ fn y216_known_pattern_rgba() { let packed = test_input(); let mut out = [0u8; 4 * 4]; - y216_to_rgb_or_rgba_row::<true>(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::<true, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..4], &[0, 0, 0, 0xFF]); assert_eq!(&out[4..8], &[127, 127, 127, 0xFF]); @@ -201,7 +225,13 @@ fn y216_known_pattern_rgb_u16() { let packed = test_input(); let mut out = [0u16; 4 * 3]; - y216_to_rgb_u16_or_rgba_u16_row::<false>(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::<false, false>( + &packed, + &mut out, + 4, + ColorMatrix::Bt709, + false, + ); // Pixel 0: Y=4096 = limited-range floor → all channels 0 assert_eq!( @@ -226,7 +256,7 @@ fn y216_known_pattern_rgba_u16() { let packed = test_input(); let mut out = [0u16; 4 * 4]; - y216_to_rgb_u16_or_rgba_u16_row::<true>(&packed, &mut out, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::<true, false>(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..4], &[0, 0, 0, 0xFFFF]); assert_eq!(&out[4..8], &[32618, 32618, 32618, 0xFFFF]); @@ -242,7 +272,7 @@ fn y216_luma_extract() { let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555]; let mut out = [0u8; 2]; - y216_to_luma_row(&packed, &mut out, 2); + y216_to_luma_row::<false>(&packed, &mut out, 2); assert_eq!(out[0], 0xAB, "Y0 luma u8"); assert_eq!(out[1], 0xCD, "Y1 luma u8"); } @@ -253,8 +283,55 @@ fn y216_luma_u16_extract() { let packed = [0xAB12u16, 0x4444, 0xCD34, 0x5555]; let mut out = [0u16; 2]; - y216_to_luma_u16_row(&packed, &mut out, 2); + y216_to_luma_u16_row::<false>(&packed, &mut out, 2); assert_eq!(out[0], 0xAB12, "Y0 luma u16"); assert_eq!(out[1], 0xCD34, "Y1 luma u16"); } + + // ---- BE=true parity tests ------------------------------------------- + + /// Verify byte-swapped Y216 input + BE=true matches LE+BE=false for RGB u8.
+ #[test] + fn y216_be_rgb_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut out_le = [0u8; 4 * 3]; + let mut out_be = [0u8; 4 * 3]; + y216_to_rgb_or_rgba_row::<false, false>(&le, &mut out_le, 4, ColorMatrix::Bt709, false); + y216_to_rgb_or_rgba_row::<false, true>(&be, &mut out_be, 4, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "BE and LE RGB paths must match"); + } + + #[test] + fn y216_be_rgb_u16_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut out_le = [0u16; 4 * 3]; + let mut out_be = [0u16; 4 * 3]; + y216_to_rgb_u16_or_rgba_u16_row::<false, false>(&le, &mut out_le, 4, ColorMatrix::Bt709, false); + y216_to_rgb_u16_or_rgba_u16_row::<false, true>(&be, &mut out_be, 4, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "BE and LE RGB u16 paths must match"); + } + + #[test] + fn y216_be_luma_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut luma_le = [0u8; 4]; + let mut luma_be = [0u8; 4]; + y216_to_luma_row::<false>(&le, &mut luma_le, 4); + y216_to_luma_row::<true>(&be, &mut luma_be, 4); + assert_eq!(luma_le, luma_be, "BE and LE luma paths must match"); + } + + #[test] + fn y216_be_luma_u16_matches_le() { + let le = test_input(); + let be = to_be_u16(&le); + let mut luma_le = [0u16; 4]; + let mut luma_be = [0u16; 4]; + y216_to_luma_u16_row::<false>(&le, &mut luma_le, 4); + y216_to_luma_u16_row::<true>(&be, &mut luma_be, 4); + assert_eq!(luma_le, luma_be, "BE and LE luma_u16 paths must match"); + } } diff --git a/src/row/scalar/y2xx.rs b/src/row/scalar/y2xx.rs index 51aa7ba0..d3c7a8b3 100644 --- a/src/row/scalar/y2xx.rs +++ b/src/row/scalar/y2xx.rs @@ -10,6 +10,15 @@ //! `BITS` (mirrors `v210.rs`'s use of `range_params_n` / //! `chroma_bias` / `q15_scale` / `q15_chroma`, just sourced from //! Y2xx's u16 packed quadruples rather than v210's 16-byte words). +//! +//! ## Big-endian wire format (`BE = true`) +//! +//! When `BE = true`, each `u16` element in `packed` is stored in +//! big-endian byte order (high byte first). The `<const BE: bool>` +//! const-generic gates `load_endian_u16::<BE>` at each sample read +//! site; on LE targets the `BE = false` path is identical to the +//! previous plain slice index. On LE hosts with `BE = false` the +//! compiler eliminates the branch entirely. use super::*; @@ -31,12 +40,14 @@ const fn rshift_bits<const BITS: u32>(sample: u16) -> u16 { /// (downshifted from the native BITS Q15 pipeline via /// `range_params_n::<BITS>`). /// +/// `BE = true` selects big-endian wire decoding for each u16 sample. +/// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2` (one u16 quadruple per chroma pair). /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>( +pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u8], width: usize, @@ -60,12 +71,15 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>( // One chroma pair (= 2 pixels) per iter. let pairs = width / 2; + // SAFETY: bounds checked by the debug_asserts above; p * 4 + 4 <= width * 2 + // because pairs = width / 2, so p < pairs means p * 4 + 4 <= width * 2.
+ let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::<BITS>(q[0]) as i32; - let u = rshift_bits::<BITS>(q[1]) as i32; - let y1 = rshift_bits::<BITS>(q[2]) as i32; - let v = rshift_bits::<BITS>(q[3]) as i32; + let off4 = p * 4 * 2; // byte offset to quadruple p (4 u16 = 8 bytes) + let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }) as i32; + let u = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) }) as i32; + let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }) as i32; + let v = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) }) as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -96,13 +110,18 @@ pub(crate) fn y2xx_n_to_rgb_or_rgba_row<const BITS: u32, const ALPHA: bool>( /// /// `ALPHA = true` writes a 4-element-per-pixel output with α = /// `(1 << BITS) - 1` (opaque maximum at the native depth). +/// `BE = true` selects big-endian wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })` (`u16` elements). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row<const BITS: u32, const ALPHA: bool>( +pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -127,12 +146,13 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row< + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::<BITS>(q[0]) as i32; - let u = rshift_bits::<BITS>(q[1]) as i32; - let y1 = rshift_bits::<BITS>(q[2]) as i32; - let v = rshift_bits::<BITS>(q[3]) as i32; + let off4 = p * 4 * 2; + let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }) as i32; + let u = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 2)) }) as i32; + let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }) as i32; + let v = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 6)) }) as i32; let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -158,13 +178,14 @@ pub(crate) fn y2xx_n_to_rgb_u16_or_rgba_u16_row< /// Y2xx → 8-bit luma. Y values are downshifted from BITS-bit to 8-bit via `>> (BITS - 8)`. Bypasses the YUV → RGB pipeline entirely. +/// `BE = true` selects big-endian wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`. /// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_luma_row<const BITS: u32>( +pub(crate) fn y2xx_n_to_luma_row<const BITS: u32, const BE: bool>( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -180,10 +201,11 @@ pub(crate) fn y2xx_n_to_luma_row<const BITS: u32>( debug_assert!(luma_out.len() >= width, "luma row too short"); let pairs = width / 2; + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - let y0 = rshift_bits::<BITS>(q[0]); - let y1 = rshift_bits::<BITS>(q[2]); + let off4 = p * 4 * 2; + let y0 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }); + let y1 = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }); luma_out[p * 2] = (y0 >> (BITS - 8)) as u8; luma_out[p * 2 + 1] = (y1 >> (BITS - 8)) as u8; } @@ -191,14 +213,15 @@ /// Y2xx → native-depth `u16` luma (low-bit-packed). Each output /// `u16` carries the source's BITS-bit Y value in its low BITS bits -/// (upper `(16 - BITS)` bits zero). +/// (upper `(16 - BITS)` bits zero). `BE = true` selects big-endian +/// wire decoding. /// /// # Panics (debug builds) /// - `width` must be even. /// - `packed.len() >= width * 2`.
/// - `luma_out.len() >= width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>( +pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32, const BE: bool>( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -214,10 +237,11 @@ pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>( debug_assert!(luma_out.len() >= width, "luma row too short"); let pairs = width / 2; + let base = packed.as_ptr().cast::<u8>(); for p in 0..pairs { - let q = &packed[p * 4..p * 4 + 4]; - luma_out[p * 2] = rshift_bits::<BITS>(q[0]); - luma_out[p * 2 + 1] = rshift_bits::<BITS>(q[2]); + let off4 = p * 4 * 2; + luma_out[p * 2] = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4)) }); + luma_out[p * 2 + 1] = rshift_bits::<BITS>(unsafe { load_endian_u16::<BE>(base.add(off4 + 4)) }); } } @@ -227,39 +251,47 @@ pub(crate) fn y2xx_n_to_luma_u16_row<const BITS: u32>( // BITS=12 wrappers (`y212_to_*_row`) without further kernel changes. /// Public Y210 (BITS=10) → packed RGB / RGBA u8 wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_rgb_or_rgba_row<const ALPHA: bool>( +pub(crate) fn y210_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_or_rgba_row::<10, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y210 → packed `u16` RGB / RGBA wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( +pub(crate) fn y210_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y210 → 8-bit luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { - y2xx_n_to_luma_row::<10>(packed, luma_out, width); +pub(crate) fn y210_to_luma_row<const BE: bool>(packed: &[u16], luma_out: &mut [u8], width: usize) { + y2xx_n_to_luma_row::<10, BE>(packed, luma_out, width); } /// Public Y210 → native-depth `u16` luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { - y2xx_n_to_luma_u16_row::<10>(packed, luma_out, width); +pub(crate) fn y210_to_luma_u16_row<const BE: bool>( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { + y2xx_n_to_luma_u16_row::<10, BE>(packed, luma_out, width); } // ---- Public Y212 (BITS=12) wrappers ------------------------------------ @@ -268,39 +300,47 @@ pub(crate) fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: // SIMD code — the per-arch backends already accept BITS ∈ {10, 12}. /// Public Y212 (BITS=12) → packed RGB / RGBA u8 wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_rgb_or_rgba_row<const ALPHA: bool>( +pub(crate) fn y212_to_rgb_or_rgba_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_or_rgba_row::<12, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_or_rgba_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y212 → packed `u16` RGB / RGBA wrapper.
+/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool>( +pub(crate) fn y212_to_rgb_u16_or_rgba_u16_row<const ALPHA: bool, const BE: bool>( packed: &[u16], out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA>(packed, out, width, matrix, full_range); + y2xx_n_to_rgb_u16_or_rgba_u16_row::<12, ALPHA, BE>(packed, out, width, matrix, full_range); } /// Public Y212 → 8-bit luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { - y2xx_n_to_luma_row::<12>(packed, luma_out, width); +pub(crate) fn y212_to_luma_row<const BE: bool>(packed: &[u16], luma_out: &mut [u8], width: usize) { + y2xx_n_to_luma_row::<12, BE>(packed, luma_out, width); } /// Public Y212 → native-depth `u16` luma wrapper. +/// `BE = true` selects big-endian wire decoding. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { - y2xx_n_to_luma_u16_row::<12>(packed, luma_out, width); +pub(crate) fn y212_to_luma_u16_row<const BE: bool>( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { + y2xx_n_to_luma_u16_row::<12, BE>(packed, luma_out, width); } #[cfg(all(test, feature = "std"))] @@ -329,12 +369,17 @@ mod tests { buf } + /// Byte-swap every u16 in a slice to produce the BE-encoded form. + fn to_be_u16(le: &[u16]) -> std::vec::Vec<u16> { + le.iter().map(|&v| v.swap_bytes()).collect() + } + #[test] fn scalar_y210_to_rgb_gray_is_gray() { // Full-range gray: Y=512, U=V=512 (10-bit center) → RGB ~128. let buf = solid_y210(8, 512, 512, 512); let mut rgb = [0u8; 8 * 3]; - y210_to_rgb_or_rgba_row::<false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::<false, false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -346,7 +391,7 @@ mod tests { fn scalar_y210_to_rgba_alpha_is_opaque() { let buf = solid_y210(8, 512, 512, 512); let mut rgba = [0u8; 8 * 4]; - y210_to_rgb_or_rgba_row::<true>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::<true, false>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert_eq!(px[3], 0xFF); } @@ -357,7 +402,7 @@ mod tests { // Full-range gray Y=512 → ~512 in 10-bit RGB out (out_max = 1023).
let buf = solid_y210(8, 512, 512, 512); let mut rgb = [0u16; 8 * 3]; - y210_to_rgb_u16_or_rgba_u16_row::<false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::<false, false>(&buf, &mut rgb, 8, ColorMatrix::Bt709, true); for px in rgb.chunks(3) { assert!(px[0].abs_diff(512) <= 2, "px expected ~512, got {}", px[0]); assert_eq!(px[0], px[1]); @@ -369,7 +414,7 @@ mod tests { fn scalar_y210_to_rgba_u16_alpha_is_max() { let buf = solid_y210(8, 512, 512, 512); let mut rgba = [0u16; 8 * 4]; - y210_to_rgb_u16_or_rgba_u16_row::<true>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::<true, false>(&buf, &mut rgba, 8, ColorMatrix::Bt709, true); for px in rgba.chunks(4) { assert_eq!(px[3], 1023, "alpha must be (1 << 10) - 1"); } @@ -388,7 +433,7 @@ mod tests { buf[i * 4 + 3] = 128u16 << 6; // V } let mut luma = [0u8; 6]; - y210_to_luma_row(&buf, &mut luma, 6); + y210_to_luma_row::<false>(&buf, &mut luma, 6); assert_eq!(luma[0], (100u16 >> 2) as u8); assert_eq!(luma[1], (200u16 >> 2) as u8); assert_eq!(luma[2], (300u16 >> 2) as u8); @@ -408,7 +453,7 @@ mod tests { buf[i * 4 + 3] = 128u16 << 6; } let mut luma = [0u16; 6]; - y210_to_luma_u16_row(&buf, &mut luma, 6); + y210_to_luma_u16_row::<false>(&buf, &mut luma, 6); assert_eq!(luma[0], 100); assert_eq!(luma[1], 200); assert_eq!(luma[2], 300); @@ -416,4 +461,64 @@ mod tests { assert_eq!(luma[4], 500); assert_eq!(luma[5], 600); } + + // ---- BE=true parity tests ------------------------------------------- + + /// Verify that byte-swapped Y210 input + BE=true produces the same + /// RGB output as the native LE input + BE=false. + #[test] + fn scalar_y210_be_rgb_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + y210_to_rgb_or_rgba_row::<false, false>(&le, &mut rgb_le, 8, ColorMatrix::Bt709, true); + y210_to_rgb_or_rgba_row::<false, true>(&be, &mut rgb_be, 8, ColorMatrix::Bt709, true); + assert_eq!( + rgb_le, rgb_be, + "BE and LE paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_rgb_u16_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut out_le = [0u16; 8 * 3]; + let mut out_be = [0u16; 8 * 3]; + y210_to_rgb_u16_or_rgba_u16_row::<false, false>(&le, &mut out_le, 8, ColorMatrix::Bt709, true); + y210_to_rgb_u16_or_rgba_u16_row::<false, true>(&be, &mut out_be, 8, ColorMatrix::Bt709, true); + assert_eq!( + out_le, out_be, + "BE and LE u16 paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_luma_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + y210_to_luma_row::<false>(&le, &mut luma_le, 8); + y210_to_luma_row::<true>(&be, &mut luma_be, 8); + assert_eq!( + luma_le, luma_be, + "BE and LE luma paths must produce identical output" + ); + } + + #[test] + fn scalar_y210_be_luma_u16_matches_le() { + let le = solid_y210(8, 512, 512, 512); + let be = to_be_u16(&le); + let mut luma_le = [0u16; 8]; + let mut luma_be = [0u16; 8]; + y210_to_luma_u16_row::<false>(&le, &mut luma_le, 8); + y210_to_luma_u16_row::<true>(&be, &mut luma_be, 8); + assert_eq!( + luma_le, luma_be, + "BE and LE luma_u16 paths must produce identical output" + ); + } } diff --git a/src/sinker/mixed/v210.rs b/src/sinker/mixed/v210.rs index 42da55d3..e59a624a 100644 --- a/src/sinker/mixed/v210.rs +++ b/src/sinker/mixed/v210.rs @@ -212,6 +212,7 @@ impl PixelSink for MixedSinker<'_, V210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y
values at native depth. @@ -221,6 +222,7 @@ impl PixelSink for MixedSinker<'_, V210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -241,6 +243,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -261,6 +264,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -291,6 +295,7 @@ impl PixelSink for MixedSinker<'_, V210> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -307,7 +312,15 @@ impl PixelSink for MixedSinker<'_, V210> { w, h, )?; - v210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + v210_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y210.rs b/src/sinker/mixed/y210.rs index cf9caaa5..430b2955 100644 --- a/src/sinker/mixed/y210.rs +++ b/src/sinker/mixed/y210.rs @@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y values at native depth. @@ -222,6 +223,7 @@ impl PixelSink for MixedSinker<'_, Y210> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -242,6 +244,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -262,6 +265,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -292,6 +296,7 @@ impl PixelSink for MixedSinker<'_, Y210> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -308,7 +313,15 @@ impl PixelSink for MixedSinker<'_, Y210> { w, h, )?; - y210_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y210_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y212.rs b/src/sinker/mixed/y212.rs index e7c1c959..1582e61e 100644 --- a/src/sinker/mixed/y212.rs +++ b/src/sinker/mixed/y212.rs @@ -211,6 +211,7 @@ impl PixelSink for MixedSinker<'_, Y212> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 12-bit Y values at native depth. 
@@ -220,6 +221,7 @@ impl PixelSink for MixedSinker<'_, Y212> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -240,6 +242,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -260,6 +263,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -290,6 +294,7 @@ impl PixelSink for MixedSinker<'_, Y212> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -306,7 +311,15 @@ impl PixelSink for MixedSinker<'_, Y212> { w, h, )?; - y212_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y212_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/y216.rs b/src/sinker/mixed/y216.rs index a8ce416d..4fdbb951 100644 --- a/src/sinker/mixed/y216.rs +++ b/src/sinker/mixed/y216.rs @@ -213,6 +213,7 @@ impl PixelSink for MixedSinker<'_, Y216> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 16-bit Y values at native depth (direct @@ -223,6 +224,7 @@ impl PixelSink for MixedSinker<'_, Y216> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -243,6 +245,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -263,6 +266,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -293,6 +297,7 @@ impl PixelSink for MixedSinker<'_, Y216> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -309,7 +314,15 @@ impl PixelSink for MixedSinker<'_, Y216> { w, h, )?; - y216_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + y216_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( From 29135613c1a807e38da58f0cb228f315e899b7fb Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 01:26:08 +1200 Subject: [PATCH 2/3] fix(be-tier4): make scalar BE conversion target-endian aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex flagged a high-severity scalar BE bug in tier 10b: the inline `if BE { x.swap_bytes() } else { x }` pattern is wrong on big-endian hosts because `swap_bytes()` is unconditional — it swaps even when the data already matches the host's byte order. The matching SIMD `load_endian_*::<BE>` helpers from PR #81 are target-endian aware (cfg-gated reverses; no-op when source order matches host order), so the buggy scalar paths diverge on s390x, corrupting both BE-input and LE-input rows when run through scalar tails or the (always-scalar) luma kernels. Audit of tier 4 scalar code confirms tier 4 was implemented from the start using the helper functions `load_endian_u16::<BE>` and `load_endian_u32::<BE>` declared in `src/row/scalar/mod.rs`, which build a fresh `[u8; N]` from the source pointer and decode via `u16::from_be_bytes` / `u16::from_le_bytes` (and the u32 pair).
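In sketch form (illustrative only: `decode_naive` exists nowhere in this crate and is just the shape the codex finding warns about, while `decode_aware` is the contract the real `load_endian_u16::<BE>` helper implements):

    // BUGGY on BE hosts: `raw` comes from a host-native u16 load. For LE
    // wire data on a BE host it is already byte-swapped relative to the
    // logical value, yet the `else` arm leaves it untouched; for BE wire
    // data the `if` arm swaps a value that already matches the host.
    fn decode_naive<const BE: bool>(raw: u16) -> u16 {
        if BE { raw.swap_bytes() } else { raw }
    }

    // Target-endian aware: from_be_bytes / from_le_bytes lower to a plain
    // load when the wire order matches the host order and to a byte-swap
    // only when they differ, so all four host/wire combinations decode
    // correctly.
    fn decode_aware<const BE: bool>(bytes: [u8; 2]) -> u16 {
        if BE { u16::from_be_bytes(bytes) } else { u16::from_le_bytes(bytes) }
    }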
Those byte-array decoders are target-endian aware: each is a no-op when the data byte order matches the host CPU and a hardware byte-swap when they differ — the same semantics as `u16::from_be` / `u16::from_le` and the SIMD `load_endian_*` helpers. No `if BE { x.swap_bytes() } else { x }` pattern exists in tier 4 production scalar code (`src/row/scalar/{v210,y2xx,y216}.rs`), so no scalar production fix is needed for s390x correctness. To prevent a future regression that introduces the buggy pattern (a real risk now that the codex finding is on file across tier 5/8/10b/10-float/11), this commit upgrades the doc-comments on `load_endian_u16` and `load_endian_u32` to: - Spell out the **target-endian aware** contract (no swap on matching host order, swap on differing order). - Cite the codex finding and reference the tier 10b fix commit message for the full motivation. - Mark the inline-`swap_bytes` pattern as the "naive alternative" that the helpers exist specifically to avoid. Test helpers `to_be_u16` (`src/row/scalar/y2xx.rs`, `src/row/scalar/y216.rs`) and `pack_v210_word_be` (`src/row/scalar/v210.rs`) are intentionally left unchanged — they synthesize BE-encoded fixtures from LE inputs on the LE-host CI, mirroring the tier 5/8/10b convention; a future phase 3 s390x QEMU run will revisit them. Verified: - cargo test --target aarch64-apple-darwin --lib (2171 passed, 0 failed) - cargo build --target x86_64-apple-darwin --tests (clean, 0 warnings) - RUSTFLAGS='-C target-feature=+simd128' cargo build --target wasm32-unknown-unknown --tests - cargo build --no-default-features - cargo fmt --check - cargo clippy --all-targets --all-features -- -D warnings Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/mod.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs index 46326468..5c8cb66c 100644 --- a/src/row/scalar/mod.rs +++ b/src/row/scalar/mod.rs @@ -128,6 +128,17 @@ pub(crate) use yuv_planar_high_bit::*; /// on-wire format); `BE = true` → big-endian. The unused branch is /// eliminated by the compiler when the caller is monomorphized. /// +/// **Target-endian aware** — this matches the SIMD `load_endian_u16x*` +/// helpers' semantics: `u16::from_be_bytes` / `u16::from_le_bytes` +/// each emit a `bswap` only when the source byte order differs from +/// the host CPU's native order. On a BE host the `BE = true` branch +/// is a plain load (no swap) and the `BE = false` branch swaps; on +/// an LE host the polarity reverses. This is the safe alternative to +/// a naive `if BE { x.swap_bytes() }` pattern, +/// which would corrupt rows on s390x / other BE hosts. See +/// `fix(be-tier10b): make scalar BE conversion target-endian aware` +/// for the codex finding that motivated this contract crate-wide. +/// /// # Safety /// /// `ptr` must point to at least 2 readable bytes. @@ -146,6 +157,12 @@ pub(super) unsafe fn load_endian_u16<const BE: bool>(ptr: *const u8) -> u16 { /// big-endian. The unused branch is eliminated by the compiler when /// the caller is monomorphized. /// +/// **Target-endian aware** — `u32::from_be_bytes` / `u32::from_le_bytes` +/// each emit a `bswap` only when the source byte order differs from +/// the host CPU's native order, matching the SIMD `load_endian_u32x*` +/// helpers. See [`load_endian_u16`] for the full target-endian +/// contract and the codex motivation. +/// /// # Safety /// /// `ptr` must point to at least 4 readable bytes.
From 9ea3e431427af3d26131f9050558f75615cf7751 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 11:39:15 +1200 Subject: [PATCH 3/3] fix(be-tier4): NEON Y2xx/Y216 host-endian gate + public API LE wrappers + Y2xx Frame LE-encoded docs/checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review of PR #88 surfaced three findings; all are direct ports of patterns already established by prior tier PRs but not propagated to Tier 4 (V210 / Y210 / Y212 / Y216). 1. [high] SIMD host-endian gate wrong on BE host (`src/row/arch/{neon,x86_sse41,x86_avx2,x86_avx512,wasm_simd128}/{y2xx,y216}.rs`). The original `if !BE { simd } else { scalar }` gate covers only 2 of the 4 host x wire quadrants — LE-encoded data on a BE host (e.g. `aarch64_be`) still ran the SIMD body, whose host-native `vld2q_u16` / `_mm_loadu_si128` / `v128_load` reads decode LE bytes as host-native (BE), corrupting every sample. Replaces the gate with `if BE == HOST_NATIVE_BE { simd } else { scalar }`, mirroring PR #82 `9c7d533` / PR #85 `9e678b0` / PR #86 `b7fb9d3` / PR #87 NEON helper fixes. Truth table in the per-file constant docstring covers all 4 quadrants. Per-backend audit (8 SIMD entries gated per backend = y2xx u8 / y2xx u16 / y2xx luma / y2xx luma_u16 + y216 u8 / y216 u16 / y216 luma / y216 luma_u16): NEON (8), x86 SSE4.1 (8), x86 AVX2 (8), x86 AVX-512 (8), wasm-simd128 (8) = 40 SIMD bodies gated. v210 backends did not have this issue (the NEON v210 SIMD body uses byte-level shuffle on the raw `&[u8]` plane, not host-native u16 reads, and the existing `if !BE` gate is sufficient there) but the public dispatcher was still source-breaking and the BE-host frame path benefits from matching docs / tests. 2. [high] Public row APIs source-breaking (`src/row/dispatch/{v210,y210,y212,y216}.rs`). PR #88 added a `big_endian: bool` parameter to 24 public functions across the V210 / Y210 / Y212 / Y216 dispatchers. Existing little-endian downstream callers (`v210_to_rgb_row(..., use_simd)`) no longer compile. Each function is renamed to `foo_endian(..., big_endian: bool, ...)` and a thin LE-only wrapper `foo` is added that forwards to `foo_endian(..., big_endian = false, ...)`. Pre-existing pre-Tier 4 call sites compile unchanged; sinker code (`src/sinker/mixed/{v210,y210,y212,y216}.rs`) is updated to call the explicit `_endian` form so endian intent is visible at every internal call site (12 sites per family x 4 = 48 internal call sites updated). Mirrors the wrapper pattern from PR #87 `177a233`. 24 LE wrappers added across 4 dispatcher files. 3. [medium] Y2xx Frame validator + docs missed LE-encoded byte contract (`src/frame/y2xx.rs`). `try_new_checked` validated host-native `u16` directly. Under the LE-encoded plane contract (PR #92 `5b42065` / `3b1d716`), a valid `Y210LE` plane on a BE host has its MSB-aligned samples appear byte-swapped when read as host-native u16, so the `& low_mask` check would falsely reject every row. Updates the validator to apply `u16::from_le(sample)` before the bit check (no-op on LE host; byte-swap on BE host). Also updates the module-level + type-level + `packed()` docs to cite the FFmpeg `AV_PIX_FMT_Y210LE` / `Y212LE` / `Y216LE` names, document the LE-encoded byte contract, and recommend `bytemuck::cast_slice` + `linesize / 2` for FFmpeg byte-buffer integration. Mirrors the PR #87 `177a233` Rgb48Frame docs and PR #92 `5b42065` Rgbf32Frame pattern.
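For reference, a condensed sketch of the corrected finding-1 gate (only the gate shape and the `HOST_NATIVE_BE` name are from this patch; `run_simd` / `run_scalar` are placeholder stand-ins for the real kernel bodies):

    const HOST_NATIVE_BE: bool = cfg!(target_endian = "big");

    fn y2xx_row_gate<const BE: bool>(packed: &[u16]) {
        // Four quadrants: wire == host (LE/LE, BE/BE) -> SIMD is safe;
        // wire != host (LE wire on a BE host, BE wire on an LE host) ->
        // the endian-aware scalar kernel must run instead.
        if BE == HOST_NATIVE_BE {
            run_simd(packed); // host-native vector loads see wire order
        } else {
            run_scalar(packed); // load_endian_u16::<BE> swaps per sample
        }
    }

    fn run_simd(_packed: &[u16]) {}
    fn run_scalar(_packed: &[u16]) {}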
V210Frame doesn't need this treatment — its plane is `&[u8]` and the wire `big_endian` flag selects MSB-vs-LSB-first interpretation of the 32-bit packed words rather than host-native byte order. Tests added: - 4 host-independent NEON BE/LE SIMD parity tests (`src/row/arch/neon/tests/{y2xx,y216,v210}.rs`): - `neon_y2xx_be_le_simd_parity_bits10` (Y210) - `neon_y2xx_be_le_simd_parity_bits12` (Y212) - `neon_y216_be_le_simd_parity` - `neon_v210_be_le_simd_parity` Each covers RGB / RGBA (where applicable) / RGB u16 / luma / luma u16 paths across widths spanning tail-only, full SIMD body, and body+tail. Built per PR #86 `6924907` pattern (raw bytes via `to_le_bytes` / `to_be_bytes` then reinterpret with `from_ne_bytes`) so byte-level fixtures are host-independent. - 2 Y2xx Frame LE-encoded contract regression tests (`src/frame/tests/y2xx.rs`): - `y210_frame_try_new_checked_accepts_le_encoded_buffer` — the new LE-aware validator must accept a properly LE-encoded plane on every host. Without the fix this fails on a BE host (false `SampleLowBitsSet` rejection). - `y210_frame_try_new_checked_rejects_be_encoded_buffer_with_low_bits` — a BE-encoded buffer of `0xFFC0` repeats must be rejected on every host (LE-decoded bytes become `0xC0FF`, low 6 bits = `0x3F`). LE-host fixture gating (mirrors PR #82 `8f2e329`): - `src/row/scalar/{y2xx,y216,v210}.rs` — 3 test mods now gated on `cfg(target_endian = "little")`. Their fixtures use host-native `u16` literals as if LE-encoded; on BE host the kernel's `from_le` byte-swap reinterprets them and the assertions fail spuriously. Kernel BE-host correctness is locked down by the new SIMD parity tests above and the existing `*_be_*_matches_le` scalar tests. - `src/row/dispatch/{v210,y210,y212,y216}.rs` — 4 dispatcher test mods gated likewise. - `src/frame/tests/y2xx.rs` — 2 individual tests (`y210_frame_try_new_checked_accepts_valid_msb_aligned_data`, `y210_frame_try_new_checked_ignores_stride_padding_bytes`) gated; the new LE-encoded-buffer test above replaces the BE-host coverage. Internal call sites updated (`use` imports + call expressions): 24 public call sites in `src/sinker/mixed/{v210,y210,y212,y216}.rs` now call `_endian` form; 2 overflow tests in `src/row/mod.rs` updated to the LE-wrapper signature. 
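Condensed sketch of that host-independent fixture construction (the helper name `le_be_fixtures` is invented for illustration; the real tests inline the equivalent per format):

    fn le_be_fixtures(intended: &[u16]) -> (Vec<u16>, Vec<u16>) {
        // Encode the intended samples as explicit LE and BE byte streams,
        // then reinterpret with from_ne_bytes so the byte-level fixture
        // means the same thing on every host.
        let reinterpret = |bytes: Vec<u8>| -> Vec<u16> {
            bytes
                .chunks_exact(2)
                .map(|b| u16::from_ne_bytes([b[0], b[1]]))
                .collect()
        };
        let le: Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
        let be: Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
        (reinterpret(le), reinterpret(be))
    }

The LE fixture then drives the `BE = false` kernel and the BE fixture the `BE = true` kernel; the two outputs must match bit-for-bit on any host.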
Verified: cargo test --target aarch64-apple-darwin --lib # 2319 pass (+6 new) cargo build --target x86_64-apple-darwin --tests # 0 warnings RUSTFLAGS="-C target-feature=+simd128" cargo build \ --target wasm32-unknown-unknown --tests # clean cargo build --no-default-features # clean cargo fmt --check # clean cargo clippy --all-targets --all-features -- -D warnings # clean cargo check --target s390x-unknown-linux-gnu --lib # BE-host smoke clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/tests/y2xx.rs | 55 ++++++++++++ src/frame/y2xx.rs | 59 ++++++++++-- src/row/arch/neon/tests/v210.rs | 89 ++++++++++++++++++ src/row/arch/neon/tests/y216.rs | 95 ++++++++++++++++++++ src/row/arch/neon/tests/y2xx.rs | 145 ++++++++++++++++++++++++++++++ src/row/arch/neon/y216.rs | 39 ++++++-- src/row/arch/neon/y2xx.rs | 40 +++++++-- src/row/arch/wasm_simd128/y216.rs | 23 ++++- src/row/arch/wasm_simd128/y2xx.rs | 23 ++++- src/row/arch/x86_avx2/y216.rs | 23 ++++- src/row/arch/x86_avx2/y2xx.rs | 23 ++++- src/row/arch/x86_avx512/y216.rs | 23 ++++- src/row/arch/x86_avx512/y2xx.rs | 23 ++++- src/row/arch/x86_sse41/y216.rs | 27 ++++-- src/row/arch/x86_sse41/y2xx.rs | 27 ++++-- src/row/dispatch/v210.rs | 130 +++++++++++++++++++++------ src/row/dispatch/y210.rs | 130 +++++++++++++++++++++------ src/row/dispatch/y212.rs | 130 +++++++++++++++++++++------ src/row/dispatch/y216.rs | 137 +++++++++++++++++++++------- src/row/mod.rs | 11 +-- src/row/scalar/v210.rs | 8 ++ src/row/scalar/y216.rs | 8 ++ src/row/scalar/y2xx.rs | 8 ++ src/sinker/mixed/v210.rs | 18 ++-- src/sinker/mixed/y210.rs | 18 ++-- src/sinker/mixed/y212.rs | 18 ++-- src/sinker/mixed/y216.rs | 18 ++-- 27 files changed, 1128 insertions(+), 220 deletions(-) diff --git a/src/frame/tests/y2xx.rs b/src/frame/tests/y2xx.rs index 3e457d05..6ebc6250 100644 --- a/src/frame/tests/y2xx.rs +++ b/src/frame/tests/y2xx.rs @@ -117,6 +117,13 @@ fn y210_frame_try_new_checked_rejects_low_bit_violations() { assert_eq!(err, Y2xxFrameError::SampleLowBitsSet); } +// LE-host-only fixture: builds host-native `u16` literals as if they were +// the LE-encoded byte layout. On a BE host the validator's `u16::from_le` +// byte-swap reinterprets host-native storage and the literal-vs-decoded +// byte order doesn't match the test's intent. The host-independent BE-host +// regression for this validator path lives in +// `y210_frame_try_new_checked_accepts_le_encoded_buffer` below. +#[cfg(target_endian = "little")] #[test] fn y210_frame_try_new_checked_accepts_valid_msb_aligned_data() { // All samples have low 6 bits == 0. @@ -124,6 +131,51 @@ fn y210_frame_try_new_checked_accepts_valid_msb_aligned_data() { Y210Frame::try_new_checked(&buf, 4, 1, 8).unwrap(); } +/// Host-independent regression for [`Y2xxFrame::try_new_checked`]'s LE-encoded +/// byte contract. Builds the plane explicitly from LE-encoded bytes via +/// `to_le_bytes` then reinterprets as `&[u16]` via `from_ne_bytes`. The +/// validator must accept this on both LE and BE hosts: on LE host +/// `from_le` is a no-op (host-native already matches); on BE host +/// `from_le` byte-swaps each sample back into host-native form before +/// the bit-check, recovering the intended MSB-aligned value. +/// +/// Without the LE-aware bit-check, this test would reject every sample +/// on a BE host (the byte-swapped storage has the active bits in the +/// low byte, which fails the low-`(16 - BITS)`-bits-zero check). 
+#[test] +fn y210_frame_try_new_checked_accepts_le_encoded_buffer() { + // Intended values: 10-bit MSB-aligned `(i << 6) & 0xFFC0` for i in 0..8. + let intended: std::vec::Vec<u16> = (0..8u16).map(|i| (i << 6) & 0xFFC0).collect(); + let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let buf: std::vec::Vec<u16> = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + Y210Frame::try_new_checked(&buf, 4, 1, 8).unwrap(); +} + +/// Host-independent BE-host regression: a *BE-encoded* buffer of valid +/// MSB-aligned values must be rejected when fed to a Y2xx frame +/// (which assumes the LE-encoded byte contract). Pick a logical value +/// whose BE-byte form, when re-interpreted as LE, has non-zero low bits. +/// +/// Logical value `0xFFC0` BE-encoded = `[0xFF, 0xC0]`. Re-interpreted +/// via `from_le_bytes([0xFF, 0xC0])` = `0xC0FF`, whose low 6 bits = +/// `0x3F` (non-zero) → `SampleLowBitsSet`. +#[test] +fn y210_frame_try_new_checked_rejects_be_encoded_buffer_with_low_bits() { + // Use the same `0xFFC0` value across the row so we get a + // deterministic rejection regardless of host. + let intended: std::vec::Vec<u16> = std::vec![0xFFC0u16; 8]; + let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let buf: std::vec::Vec<u16> = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let err = Y210Frame::try_new_checked(&buf, 4, 1, 8).unwrap_err(); + assert_eq!(err, Y2xxFrameError::SampleLowBitsSet); +} + #[test] #[should_panic(expected = "invalid Y2xxFrame:")] fn y210_frame_new_panics_on_invalid() { @@ -131,6 +183,9 @@ fn y210_frame_new_panics_on_invalid() { let _ = Y210Frame::new(&buf, 0, 0, 0); } +// LE-host-only fixture: builds host-native `u16` literals as if they were +// the LE-encoded byte layout. See note above on the LE-encoded contract. +#[cfg(target_endian = "little")] #[test] fn y210_frame_try_new_checked_ignores_stride_padding_bytes() { // Width=4 → row_elems = 8 u16; stride = 12 u16 (4 u16 padding per row). diff --git a/src/frame/y2xx.rs b/src/frame/y2xx.rs index 19dbae9a..77ccd855 100644 --- a/src/frame/y2xx.rs +++ b/src/frame/y2xx.rs @@ -5,14 +5,30 @@ //! u16 quadruples (`Y₀, U, Y₁, V`). Active bits are MSB-aligned; //! low `(16 - BITS)` bits are zero. //! -//! | Format | BITS | Active bit width | Low bits | -//! |--------|------|------------------|----------| -//! | Y210 | 10 | bits[15:6] | bits[5:0] = 0 | -//! | Y212 | 12 | bits[15:4] | bits[3:0] = 0 | -//! | Y216 | 16 | bits[15:0] | n/a (full range) | +//! | Format | BITS | FFmpeg pix_fmt | Active bit width | Low bits | +//! |--------|------|--------------------------|------------------|----------| +//! | Y210 | 10 | `AV_PIX_FMT_Y210LE` | bits[15:6] | bits[5:0] = 0 | +//! | Y212 | 12 | `AV_PIX_FMT_Y212LE` | bits[15:4] | bits[3:0] = 0 | +//! | Y216 | 16 | `AV_PIX_FMT_Y216LE` | bits[15:0] | n/a (full range) | //! //! Width must be even (4:2:2 chroma subsampling). //! +//! # Endian contract — **LE-encoded bytes** +//! +//! The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted as +//! `u16`, matching the FFmpeg `*LE` pixel-format suffix in each format's +//! name. On a little-endian host (every CI runner today) LE bytes _are_ +//! host-native, so `&[u16]` is also a host-native u16 slice; on a +//! big-endian host the bytes have to be byte-swapped back to host-native +//! before arithmetic. Downstream row kernels handle this byte-swap (or
no-op on LE) under the hood — callers do **not** pre-swap. +//! +//! Stride is in **u16 elements** (not bytes). Callers holding a raw +//! FFmpeg byte buffer should cast via `bytemuck::cast_slice` (which +//! checks alignment at runtime) and divide `linesize[0]` by 2 before +//! constructing. Direct pointer casts to `&[u16]` are undefined behaviour +//! if the byte buffer is not 2-byte aligned. +//! //! Used by Ship 11b (Y210), Ship 11c (Y212 — wiring-only), and //! Ship 11d (Y216 — separate kernel family with i64 chroma path). @@ -20,7 +36,8 @@ use derive_more::IsVariant; use thiserror::Error; /// Validated wrapper around a packed YUV 4:2:2 high-bit-depth plane -/// for the `Y210` / `Y212` / `Y216` family. +/// for the `Y210` / `Y212` / `Y216` family +/// (`AV_PIX_FMT_Y210LE` / `Y212LE` / `Y216LE`). /// /// `BITS` selects the active sample width: 10, 12, or 16. Construct /// via [`Self::try_new`] (fallible) or [`Self::new`] (panics on @@ -28,6 +45,17 @@ use thiserror::Error; /// [`Self::try_new_checked`] additionally verifies that every /// sample's low `(16 - BITS)` bits are zero (matches the /// `P010::try_new_checked` pattern). +/// +/// The `&[u16]` plane is the **LE-encoded byte layout** reinterpreted +/// as `u16`, matching the FFmpeg `*LE` pixel-format convention. On a +/// little-endian host (every CI runner today) LE bytes _are_ +/// host-native, so `&[u16]` is also a host-native u16 slice; on a +/// big-endian host the bytes have to be byte-swapped back to +/// host-native before arithmetic. Downstream row kernels handle the +/// byte-swap under the hood — callers do **not** pre-swap. Callers +/// holding raw FFmpeg byte buffers should cast via +/// `bytemuck::cast_slice` and divide `linesize[0]` by 2 before +/// constructing. #[derive(Debug, Clone, Copy)] pub struct Y2xxFrame<'a, const BITS: u32> { packed: &'a [u16], @@ -167,6 +195,16 @@ impl<'a, const BITS: u32> Y2xxFrame<'a, BITS> { /// low `(16 - BITS)` bits are non-zero. Only meaningful for /// `BITS ∈ {10, 12}`; for `BITS = 16` this delegates to /// [`Self::try_new`] (no low bits to check). + /// + /// Per the LE-encoded byte contract on the type-level docs, samples + /// are validated **after** `u16::from_le` normalization so the bit + /// check operates on the intended logical sample value on every host. + /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form. Without this normalization a + /// valid `Y210LE` plane on a BE host would have its MSB-aligned + /// samples appear byte-swapped (low bits set in the host-native + /// reading) and the validator would falsely reject every row. #[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( packed: &'a [u16], @@ -183,7 +221,9 @@ impl<'a, const BITS: u32> Y2xxFrame<'a, BITS> { for row in 0..h { let start = row * stride_us; for &sample in &packed[start..start + row_elems] { - if sample & low_mask != 0 { + // Normalize from LE-encoded wire to host-native before the + // bit check (no-op on LE host, byte-swap on BE host). + if u16::from_le(sample) & low_mask != 0 { return Err(Y2xxFrameError::SampleLowBitsSet); } } @@ -221,7 +261,10 @@ impl<'a, const BITS: u32> Y2xxFrame<'a, BITS> { /// Packed plane: `(Y₀, U, Y₁, V)` u16 quadruples — `width × 2` /// u16 elements per row (= `width × 4` bytes). 4:2:2 chroma is /// shared between each Y pair; samples are MSB-aligned with the - /// low `(16 - BITS)` bits zero. 
+    /// low `(16 - BITS)` bits zero (`BITS ∈ {10, 12}`).
+    ///
+    /// The slice carries the **LE-encoded byte layout** reinterpreted
+    /// as `u16` (FFmpeg `*LE` convention) — see the type-level docs.
     #[cfg_attr(not(tarpaulin), inline(always))]
     pub const fn packed(&self) -> &'a [u16] {
         self.packed
diff --git a/src/row/arch/neon/tests/v210.rs b/src/row/arch/neon/tests/v210.rs
index be979537..a0bbe08c 100644
--- a/src/row/arch/neon/tests/v210.rs
+++ b/src/row/arch/neon/tests/v210.rs
@@ -242,3 +242,92 @@ fn neon_v210_lane_order_per_pixel_y_and_u() {
         "neon v210 SIMD vs scalar diverges — chroma deinterleave bug"
     );
 }
+
+// ---- Host-independent BE/LE SIMD parity tests ----------------------------
+//
+// v210 packs three 10-bit samples per 32-bit word; the LE/BE wire forms
+// differ by the byte order of each 32-bit word. We materialize both
+// forms from raw bytes via `to_le_bytes` / `to_be_bytes` so the test
+// is host-independent (mirrors PR #86 `6924907` for V410). Locks down
+// the `BE == HOST_NATIVE_BE` host-endian gate fix on the NEON v210 SIMD
+// body.
+
+fn build_le_be_v210(words: usize, seed: usize) -> (std::vec::Vec<u8>, std::vec::Vec<u8>) {
+    // Build the intended u32 words first, then materialize LE-byte and
+    // BE-byte forms explicitly.
+    let mut intended_words: std::vec::Vec<u32> = std::vec::Vec::with_capacity(words * 4);
+    for w in 0..words {
+        for k in 0..4 {
+            let s0 = ((w * 4 + k) * 37 + seed) & 0x3FF;
+            let s1 = ((w * 4 + k) * 53 + seed * 3) & 0x3FF;
+            let s2 = ((w * 4 + k) * 71 + seed * 7) & 0x3FF;
+            intended_words.push((s0 as u32) | ((s1 as u32) << 10) | ((s2 as u32) << 20));
+        }
+    }
+    let le_bytes: std::vec::Vec<u8> = intended_words
+        .iter()
+        .flat_map(|w| w.to_le_bytes())
+        .collect();
+    let be_bytes: std::vec::Vec<u8> = intended_words
+        .iter()
+        .flat_map(|w| w.to_be_bytes())
+        .collect();
+    (le_bytes, be_bytes)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_v210_be_le_simd_parity() {
+    for w in [6usize, 12, 18, 24, 36, 1920] {
+        let words = w.div_ceil(6);
+        let (le, be) = build_le_be_v210(words, 0xCAFE);
+
+        // u8 RGB
+        let mut le_rgb = std::vec![0u8; w * 3];
+        let mut be_rgb = std::vec![0u8; w * 3];
+        unsafe {
+            v210_to_rgb_or_rgba_row::<false, false>(&le, &mut le_rgb, w, ColorMatrix::Bt709, false);
+            v210_to_rgb_or_rgba_row::<false, true>(&be, &mut be_rgb, w, ColorMatrix::Bt709, false);
+        }
+        assert_eq!(le_rgb, be_rgb, "v210 NEON LE vs BE RGB parity (w={w})");
+
+        // u16 RGB
+        let mut le_u16 = std::vec![0u16; w * 3];
+        let mut be_u16 = std::vec![0u16; w * 3];
+        unsafe {
+            v210_to_rgb_u16_or_rgba_u16_row::<false, false>(
+                &le,
+                &mut le_u16,
+                w,
+                ColorMatrix::Bt709,
+                false,
+            );
+            v210_to_rgb_u16_or_rgba_u16_row::<false, true>(
+                &be,
+                &mut be_u16,
+                w,
+                ColorMatrix::Bt709,
+                false,
+            );
+        }
+        assert_eq!(le_u16, be_u16, "v210 NEON LE vs BE RGB u16 parity (w={w})");
+
+        // luma u8
+        let mut le_l = std::vec![0u8; w];
+        let mut be_l = std::vec![0u8; w];
+        unsafe {
+            v210_to_luma_row::<false>(&le, &mut le_l, w);
+            v210_to_luma_row::<true>(&be, &mut be_l, w);
+        }
+        assert_eq!(le_l, be_l, "v210 NEON LE vs BE luma u8 parity (w={w})");
+
+        // luma u16
+        let mut le_lu = std::vec![0u16; w];
+        let mut be_lu = std::vec![0u16; w];
+        unsafe {
+            v210_to_luma_u16_row::<false>(&le, &mut le_lu, w);
+            v210_to_luma_u16_row::<true>(&be, &mut be_lu, w);
+        }
+        assert_eq!(le_lu, be_lu, "v210 NEON LE vs BE luma u16 parity (w={w})");
+    }
+}
diff --git a/src/row/arch/neon/tests/y216.rs b/src/row/arch/neon/tests/y216.rs
index 1a982f4d..26fc04c1 100644
--- a/src/row/arch/neon/tests/y216.rs
+++ b/src/row/arch/neon/tests/y216.rs
@@ -171,3 +171,98 @@ fn neon_y216_lane_order_per_pixel_y_and_u() {
         "NEON y216 SIMD vs scalar diverges (u16 RGB, i64 chroma)"
     );
 }
+
+// ---- Host-independent BE/LE SIMD parity tests ----------------------------
+//
+// Built per PR #86 `6924907` pattern: construct LE/BE buffers from raw
+// bytes via `to_le_bytes` / `to_be_bytes` and reinterpret as host-native
+// `u16` via `from_ne_bytes`. The byte-level encoding is host-independent —
+// on every host the LE buffer carries the intended values as LE-encoded
+// bytes and the BE buffer carries the same values as BE-encoded bytes —
+// so both kernel monomorphizations decode to the same logical values and
+// produce byte-identical output on both LE and BE hosts. Locks down the
+// `BE == HOST_NATIVE_BE` host-endian gate fix applied to the NEON Y216
+// SIMD bodies (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` /
+// PR #86 `b7fb9d3`).
+
+fn build_le_be_y216(width: usize, seed: usize) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let intended = pseudo_random_y216(width, seed);
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_y216_be_le_simd_parity() {
+    // Widths covering tail-only (< 16), full SIMD body (16 px), and
+    // body+tail to exercise both code paths on every host.
+    for w in [8usize, 14, 16, 22, 32, 1920] {
+        let (le, be) = build_le_be_y216(w, 0xBEEF);
+
+        // u8 RGB
+        let mut le_rgb = std::vec![0u8; w * 3];
+        let mut be_rgb = std::vec![0u8; w * 3];
+        unsafe {
+            y216_to_rgb_or_rgba_row::<false, false>(&le, &mut le_rgb, w, ColorMatrix::Bt709, false);
+            y216_to_rgb_or_rgba_row::<false, true>(&be, &mut be_rgb, w, ColorMatrix::Bt709, false);
+        }
+        assert_eq!(le_rgb, be_rgb, "y216 NEON LE vs BE RGB parity (w={w})");
+
+        // u8 RGBA
+        let mut le_rgba = std::vec![0u8; w * 4];
+        let mut be_rgba = std::vec![0u8; w * 4];
+        unsafe {
+            y216_to_rgb_or_rgba_row::<true, false>(&le, &mut le_rgba, w, ColorMatrix::Bt709, false);
+            y216_to_rgb_or_rgba_row::<true, true>(&be, &mut be_rgba, w, ColorMatrix::Bt709, false);
+        }
+        assert_eq!(le_rgba, be_rgba, "y216 NEON LE vs BE RGBA parity (w={w})");
+
+        // u16 RGB (i64-chroma path)
+        let mut le_u16 = std::vec![0u16; w * 3];
+        let mut be_u16 = std::vec![0u16; w * 3];
+        unsafe {
+            y216_to_rgb_u16_or_rgba_u16_row::<false, false>(
+                &le,
+                &mut le_u16,
+                w,
+                ColorMatrix::Bt2020Ncl,
+                true,
+            );
+            y216_to_rgb_u16_or_rgba_u16_row::<false, true>(
+                &be,
+                &mut be_u16,
+                w,
+                ColorMatrix::Bt2020Ncl,
+                true,
+            );
+        }
+        assert_eq!(le_u16, be_u16, "y216 NEON LE vs BE RGB u16 parity (w={w})");
+
+        // luma u8
+        let mut le_l = std::vec![0u8; w];
+        let mut be_l = std::vec![0u8; w];
+        unsafe {
+            y216_to_luma_row::<false>(&le, &mut le_l, w);
+            y216_to_luma_row::<true>(&be, &mut be_l, w);
+        }
+        assert_eq!(le_l, be_l, "y216 NEON LE vs BE luma u8 parity (w={w})");
+
+        // luma u16
+        let mut le_lu = std::vec![0u16; w];
+        let mut be_lu = std::vec![0u16; w];
+        unsafe {
+            y216_to_luma_u16_row::<false>(&le, &mut le_lu, w);
+            y216_to_luma_u16_row::<true>(&be, &mut be_lu, w);
+        }
+        assert_eq!(le_lu, be_lu, "y216 NEON LE vs BE luma u16 parity (w={w})");
+    }
+}
diff --git a/src/row/arch/neon/tests/y2xx.rs b/src/row/arch/neon/tests/y2xx.rs
index d12a51d4..96f24f82 100644
--- a/src/row/arch/neon/tests/y2xx.rs
+++ b/src/row/arch/neon/tests/y2xx.rs
@@ -272,3 +272,148 @@ fn neon_y212_matches_scalar_widths() {
         assert_eq!(slu, klu, "NEON y2xx<12>→luma u16 diverges (width={w})");
     }
 }
+
+// ---- Host-independent BE/LE SIMD parity tests ----------------------------
+//
+// Built per PR #86 `6924907` pattern: construct LE/BE buffers from raw
+// bytes via `to_le_bytes` / `to_be_bytes` and reinterpret as host-native
+// `u16` via `from_ne_bytes`. The byte-level encoding is then host-
+// independent — on every host the LE buffer carries the intended values
+// as LE-encoded bytes and the BE buffer carries the same values as
+// BE-encoded bytes — so both kernel monomorphizations decode to the
+// same logical values and produce byte-identical output on both LE and
+// BE hosts. Locks down the `BE == HOST_NATIVE_BE` host-endian gate fix
+// applied to the NEON Y2xx SIMD bodies (mirrors PR #82 `9c7d533` /
+// PR #85 `9e678b0` / PR #86 `b7fb9d3`).
+
+/// Builds intended Y2xx-shaped values then materializes both LE-encoded
+/// and BE-encoded `&[u16]` planes from raw bytes (host-independent).
+fn build_le_be_y2xx<const BITS: u32>(
+    width: usize,
+    seed: usize,
+) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let shift = 16 - BITS;
+    let mask: u16 = (1u16 << BITS) - 1;
+    let intended: std::vec::Vec<u16> = (0..width * 2)
+        .map(|i| {
+            let s = ((i.wrapping_mul(seed).wrapping_add(seed * 3)) & mask as usize) as u16;
+            s << shift
+        })
+        .collect();
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+#[test]
+#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")]
+fn neon_y2xx_be_le_simd_parity_bits10() {
+    // Widths covering the SIMD body (8 px), tail-only (< 8), and
+    // body+tail (8 + tail) so both code paths are exercised on each host.
+ for w in [4usize, 8, 14, 16, 22, 32, 1920] { + let (le, be) = build_le_be_y2xx::<10>(w, 0xBEEF); + // u8 RGB + let mut le_rgb = std::vec![0u8; w * 3]; + let mut be_rgb = std::vec![0u8; w * 3]; + unsafe { + y2xx_n_to_rgb_or_rgba_row::<10, false, false>(&le, &mut le_rgb, w, ColorMatrix::Bt709, false); + y2xx_n_to_rgb_or_rgba_row::<10, false, true>(&be, &mut be_rgb, w, ColorMatrix::Bt709, false); + } + assert_eq!(le_rgb, be_rgb, "y2xx<10> NEON LE vs BE RGB parity (w={w})"); + + // u16 RGB + let mut le_u16 = std::vec![0u16; w * 3]; + let mut be_u16 = std::vec![0u16; w * 3]; + unsafe { + y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, false>( + &le, + &mut le_u16, + w, + ColorMatrix::Bt709, + false, + ); + y2xx_n_to_rgb_u16_or_rgba_u16_row::<10, false, true>( + &be, + &mut be_u16, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + le_u16, be_u16, + "y2xx<10> NEON LE vs BE RGB u16 parity (w={w})" + ); + + // luma u8 + let mut le_l = std::vec![0u8; w]; + let mut be_l = std::vec![0u8; w]; + unsafe { + y2xx_n_to_luma_row::<10, false>(&le, &mut le_l, w); + y2xx_n_to_luma_row::<10, true>(&be, &mut be_l, w); + } + assert_eq!(le_l, be_l, "y2xx<10> NEON LE vs BE luma u8 parity (w={w})"); + + // luma u16 + let mut le_lu = std::vec![0u16; w]; + let mut be_lu = std::vec![0u16; w]; + unsafe { + y2xx_n_to_luma_u16_row::<10, false>(&le, &mut le_lu, w); + y2xx_n_to_luma_u16_row::<10, true>(&be, &mut be_lu, w); + } + assert_eq!( + le_lu, be_lu, + "y2xx<10> NEON LE vs BE luma u16 parity (w={w})" + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_y2xx_be_le_simd_parity_bits12() { + for w in [4usize, 8, 14, 16, 22, 32, 1920] { + let (le, be) = build_le_be_y2xx::<12>(w, 0xC0DE); + + let mut le_rgba = std::vec![0u8; w * 4]; + let mut be_rgba = std::vec![0u8; w * 4]; + unsafe { + y2xx_n_to_rgb_or_rgba_row::<12, true, false>( + &le, + &mut le_rgba, + w, + ColorMatrix::Bt2020Ncl, + true, + ); + y2xx_n_to_rgb_or_rgba_row::<12, true, true>( + &be, + &mut be_rgba, + w, + ColorMatrix::Bt2020Ncl, + true, + ); + } + assert_eq!( + le_rgba, be_rgba, + "y2xx<12> NEON LE vs BE RGBA parity (w={w})" + ); + + let mut le_lu = std::vec![0u16; w]; + let mut be_lu = std::vec![0u16; w]; + unsafe { + y2xx_n_to_luma_u16_row::<12, false>(&le, &mut le_lu, w); + y2xx_n_to_luma_u16_row::<12, true>(&be, &mut be_lu, w); + } + assert_eq!( + le_lu, be_lu, + "y2xx<12> NEON LE vs BE luma u16 parity (w={w})" + ); + } +} diff --git a/src/row/arch/neon/y216.rs b/src/row/arch/neon/y216.rs index 01a26e62..316f4267 100644 --- a/src/row/arch/neon/y216.rs +++ b/src/row/arch/neon/y216.rs @@ -29,6 +29,21 @@ use core::arch::aarch64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y216 SIMD bodies. +/// +/// `vld2q_u16` deinterleaves using **host-native** u16 reads, so the SIMD +/// body is only correct when the encoded byte order matches the host. 
The +/// truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` / PR #86 +/// `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- u8 output (i32 chroma, 16 px/iter) --------------------------------- /// NEON Y216 → packed u8 RGB or RGBA. @@ -62,9 +77,11 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( const RND: i32 = 1 << 14; unsafe { - // BE=true: bypass NEON; scalar handles full row below. + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row + // via `from_le` / `from_be`. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = vdupq_n_s32(RND); // For the u8 output path: `scale_y_u16_to_i16` takes i32x4 y_off. // Y values are full u16 (0..65535), so we must use u16-aware widening @@ -179,7 +196,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( x += 16; } - } // end if !BE + } // end if BE == HOST_NATIVE_BE // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { @@ -240,9 +257,11 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( debug_assert!(out.len() >= width); unsafe { + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { while x + 16 <= width { // Two vld2q_u16: pair.0 = 8 Y lanes each; chroma discarded. let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); @@ -500,8 +521,10 @@ pub(crate) unsafe fn y216_to_luma_u16_row( debug_assert!(out.len() >= width); unsafe { + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { while x + 16 <= width { let pair_lo = vld2q_u16(packed.as_ptr().add(x * 2)); let pair_hi = vld2q_u16(packed.as_ptr().add(x * 2 + 16)); diff --git a/src/row/arch/neon/y2xx.rs b/src/row/arch/neon/y2xx.rs index 72920362..434ff40d 100644 --- a/src/row/arch/neon/y2xx.rs +++ b/src/row/arch/neon/y2xx.rs @@ -36,6 +36,26 @@ use core::arch::aarch64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx SIMD bodies. +/// +/// `vld2q_u16` deinterleaves using **host-native** u16 reads, so the SIMD +/// body is only correct when the encoded byte order matches the host. The +/// truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` / PR #86 +/// `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +/// +/// The previous `if !BE` gate only covered rows 1+3 (LE-host) and would +/// run the SIMD body for LE-encoded data on a BE host (e.g. `aarch64_be`), +/// where `vld2q_u16` reads LE bytes as host-native (BE) and corrupts every +/// sample. 
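+///
+/// A minimal sketch of how the gate collapses (illustrative only —
+/// `demo` is not part of this module). `BE` and `HOST_NATIVE_BE` are
+/// both compile-time constants, so in practice each monomorphization
+/// keeps exactly one branch and the other is trivially dead:
+///
+/// ```ignore
+/// fn demo<const BE: bool>() -> &'static str {
+///     if BE == HOST_NATIVE_BE {
+///         "SIMD body (wire byte order matches host)"
+///     } else {
+///         "scalar full-row fallback (from_le / from_be)"
+///     }
+/// }
+/// ```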
+const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Loads 8 Y2xx pixels (16 u16 samples = 32 bytes) and unpacks them /// into three 8‑lane vectors: /// - `y_vec`: lanes 0..8 = Y0..Y7 in `[0, 2^BITS - 1]`. @@ -131,9 +151,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< // by the `while x + 8 <= width` loop and the caller-promised slice // lengths checked above. unsafe { - // BE=true: NEON path skipped; scalar handles all pixels below. + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row + // via `from_le` / `from_be`. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = vdupq_n_s32(RND); let y_off_v = vdupq_n_s16(y_off as i16); let y_scale_v = vdupq_n_s32(y_scale); @@ -271,9 +293,11 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< // SAFETY: caller's obligation per the safety contract above. unsafe { - // BE=true: bypass NEON; scalar handles full row below. + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row + // via `from_le` / `from_be`. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = vdupq_n_s32(RND); let y_off_v = vdupq_n_s16(y_off as i16); let y_scale_v = vdupq_n_s32(y_scale); @@ -381,8 +405,10 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { while x + 8 <= width { // `vld2q_u16` deinterleaves; `pair.0` is 8 raw Y u16 samples // (still MSB‑aligned at BITS ≤ 12, low bits zero). @@ -434,8 +460,10 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { + // SIMD body runs only when the wire byte order matches the host + // (`BE == HOST_NATIVE_BE`); otherwise scalar handles the full row. let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let shr_count = vdupq_n_s16(-((16 - BITS) as i16)); while x + 8 <= width { let pair = vld2q_u16(packed.as_ptr().add(x * 2)); diff --git a/src/row/arch/wasm_simd128/y216.rs b/src/row/arch/wasm_simd128/y216.rs index 7bdf6363..f833b4e7 100644 --- a/src/row/arch/wasm_simd128/y216.rs +++ b/src/row/arch/wasm_simd128/y216.rs @@ -46,6 +46,21 @@ use core::arch::wasm32::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Loads 8 Y216 pixels (16 u16 samples = 32 bytes) and extracts /// the Y, U, V vectors **without any right-shift** (BITS=16, already /// full-range). 
Returns `(y_vec, u_vec, v_vec)` where: @@ -125,7 +140,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = i32x4_splat(RND); let y_off32_v = i32x4_splat(y_off); let y_scale_v = i32x4_splat(y_scale); @@ -265,7 +280,7 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Y permute: even u16 lanes → low 8 bytes; zeroed high. let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -463,7 +478,7 @@ pub(crate) unsafe fn y216_to_luma_u16_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); // 16 px/iter: two groups of 8 Y samples (u16 direct copy, no shift). diff --git a/src/row/arch/wasm_simd128/y2xx.rs b/src/row/arch/wasm_simd128/y2xx.rs index 83c4a6eb..c442d5c4 100644 --- a/src/row/arch/wasm_simd128/y2xx.rs +++ b/src/row/arch/wasm_simd128/y2xx.rs @@ -52,6 +52,21 @@ use core::arch::wasm32::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Loads 8 Y2xx pixels (16 u16 samples = 32 bytes) and unpacks them /// into three `v128` vectors holding `BITS`-bit samples in their low /// bits (each lane an i16): @@ -170,7 +185,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< // caller-promised slice lengths checked above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = i32x4_splat(RND); let y_off_v = i16x8_splat(y_off as i16); let y_scale_v = i32x4_splat(y_scale); @@ -334,7 +349,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = i32x4_splat(RND); let y_off_v = i16x8_splat(y_off as i16); let y_scale_v = i32x4_splat(y_scale); @@ -446,7 +461,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Y permute mask: pick even u16 lanes (low byte at [0], high byte // at [1]) into the low 8 bytes; high 8 bytes zeroed. let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -517,7 +532,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. 
unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let shr_count: u32 = 16 - BITS; let y_idx = i8x16(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); diff --git a/src/row/arch/x86_avx2/y216.rs b/src/row/arch/x86_avx2/y216.rs index cf850e18..5635b5bb 100644 --- a/src/row/arch/x86_avx2/y216.rs +++ b/src/row/arch/x86_avx2/y216.rs @@ -31,6 +31,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Deinterleave helper (shared by u8 and u16 paths) ------------------- /// Deinterleaves 16 YUYV u16 quadruples (= 32 u16 = 64 bytes) loaded @@ -129,7 +144,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm256_set1_epi32(RND); // y_off as i32 — scale_y_u16_avx2 takes i32x8 y_off. let y_off_v = _mm256_set1_epi32(y_off); @@ -308,7 +323,7 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Per-lane Y permute mask: pick even u16 lanes (low byte first) into // the low 8 bytes of each 128-bit lane; high 8 bytes zeroed. let split_idx = _mm256_setr_epi8( @@ -557,7 +572,7 @@ pub(crate) unsafe fn y216_to_luma_u16_row( // SAFETY: AVX2 availability is the caller's obligation. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Per-lane Y permute mask (same as luma_row above). let split_idx = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, diff --git a/src/row/arch/x86_avx2/y2xx.rs b/src/row/arch/x86_avx2/y2xx.rs index bc3c5bb1..ff057261 100644 --- a/src/row/arch/x86_avx2/y2xx.rs +++ b/src/row/arch/x86_avx2/y2xx.rs @@ -51,6 +51,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. 
The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Loads 16 Y2xx pixels (32 u16 samples = 64 bytes) and unpacks them /// into three `__m256i` vectors holding `BITS`-bit samples in their /// low bits (each lane an i16): @@ -197,7 +212,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< // lengths checked above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm256_set1_epi32(RND); let y_off_v = _mm256_set1_epi16(y_off as i16); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -372,7 +387,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm256_set1_epi32(RND); let y_off_v = _mm256_set1_epi16(y_off as i16); let y_scale_v = _mm256_set1_epi32(y_scale); @@ -522,7 +537,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Per-lane Y permute mask: pick even u16 lanes (low byte at [0], // high byte at [1]) into the low 8 bytes; high 8 bytes zeroed. let split_idx = _mm256_setr_epi8( @@ -602,7 +617,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let split_idx = _mm256_setr_epi8( 0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1, // low lane diff --git a/src/row/arch/x86_avx512/y216.rs b/src/row/arch/x86_avx512/y216.rs index be564433..fa61f89c 100644 --- a/src/row/arch/x86_avx512/y216.rs +++ b/src/row/arch/x86_avx512/y216.rs @@ -41,6 +41,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Static permute index tables (same layout as y2xx.rs) --------------- #[rustfmt::skip] @@ -138,7 +153,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. 
unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm512_set1_epi32(RND); let y_off_v = _mm512_set1_epi32(y_off); let y_scale_v = _mm512_set1_epi32(y_scale); @@ -331,7 +346,7 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); @@ -553,7 +568,7 @@ pub(crate) unsafe fn y216_to_luma_u16_row( // SAFETY: AVX-512F + AVX-512BW is the caller's obligation. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); while x + 64 <= width { diff --git a/src/row/arch/x86_avx512/y2xx.rs b/src/row/arch/x86_avx512/y2xx.rs index 1d2b1dcd..5c1d6c19 100644 --- a/src/row/arch/x86_avx512/y2xx.rs +++ b/src/row/arch/x86_avx512/y2xx.rs @@ -65,6 +65,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- Static permute index tables -------------------------------------- // // Each table is a 32-lane u16 permute index. For @@ -210,7 +225,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< // caller-promised slice lengths checked above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm512_set1_epi32(RND); let y_off_v = _mm512_set1_epi16(y_off as i16); let y_scale_v = _mm512_set1_epi32(y_scale); @@ -395,7 +410,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm512_set1_epi32(RND); let y_off_v = _mm512_set1_epi16(y_off as i16); let y_scale_v = _mm512_set1_epi32(y_scale); @@ -519,7 +534,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let pack_fixup = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7); let zero = _mm512_setzero_si512(); let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); @@ -587,7 +602,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. 
unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let y_idx = _mm512_loadu_si512(Y_FROM_YUYV_IDX.as_ptr().cast()); diff --git a/src/row/arch/x86_sse41/y216.rs b/src/row/arch/x86_sse41/y216.rs index e799caee..308bb1e2 100644 --- a/src/row/arch/x86_sse41/y216.rs +++ b/src/row/arch/x86_sse41/y216.rs @@ -34,6 +34,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + // ---- u8 output (i32 chroma, 16 px/iter) --------------------------------- /// SSE4.1 Y216 → packed u8 RGB or RGBA. @@ -66,7 +81,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm_set1_epi32(RND); // Y216 samples are full u16 [0..65535]; use i32 y_off and // scale_y_u16 (unsigned widening) to avoid sign-bit corruption for Y > 32767. @@ -213,7 +228,7 @@ pub(crate) unsafe fn y216_to_rgb_or_rgba_row( x += 16; } - } // end if !BE + } // end if BE == HOST_NATIVE_BE // Scalar tail — remaining < 16 pixels, or full-row fallback when BE=true. if x < width { @@ -267,7 +282,7 @@ pub(crate) unsafe fn y216_to_rgb_u16_or_rgba_u16_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Pick even u16 lanes (Y samples) into low 8 bytes, zero high bytes. let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -511,7 +526,7 @@ pub(crate) unsafe fn y216_to_luma_u16_row( unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); while x + 16 <= width { diff --git a/src/row/arch/x86_sse41/y2xx.rs b/src/row/arch/x86_sse41/y2xx.rs index e8e18aff..d7e36434 100644 --- a/src/row/arch/x86_sse41/y2xx.rs +++ b/src/row/arch/x86_sse41/y2xx.rs @@ -46,6 +46,21 @@ use core::arch::x86_64::*; use super::*; use crate::{ColorMatrix, row::scalar}; +/// Host-endian gate for Y2xx/Y216 SIMD bodies. +/// +/// SIMD deinterleave / fixed-shuffle paths use **host-native** u16 reads, +/// so the SIMD body is only correct when the encoded byte order matches +/// the host. 
The truth table (mirrors PR #82 `9c7d533` / PR #85 `9e678b0` +/// / PR #86 `b7fb9d3` host-endian gate fixes): +/// +/// | wire `BE` | host | `BE == HOST_NATIVE_BE` | path | correct via | +/// |-----------|------------|------------------------|--------|----------------| +/// | false | LE | true | SIMD | host-native LE | +/// | false | BE | false | scalar | `from_le` | +/// | true | LE | false | scalar | `from_be` | +/// | true | BE | true | SIMD | host-native BE | +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + /// Loads 8 Y2xx pixels (16 u16 samples = 32 bytes) and unpacks them /// into three `__m128i` vectors holding `BITS`-bit samples in their /// low bits (each lane an i16): @@ -163,7 +178,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< // lengths checked above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm_set1_epi32(RND); let y_off_v = _mm_set1_epi16(y_off as i16); let y_scale_v = _mm_set1_epi32(y_scale); @@ -261,7 +276,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_or_rgba_row< x += 8; } - } // end if !BE + } // end if BE == HOST_NATIVE_BE // Scalar tail — remaining < 8 pixels (always even per 4:2:2), // or full-row fallback when BE=true. @@ -326,7 +341,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let rnd_v = _mm_set1_epi32(RND); let y_off_v = _mm_set1_epi16(y_off as i16); let y_scale_v = _mm_set1_epi32(y_scale); @@ -386,7 +401,7 @@ pub(crate) unsafe fn y2xx_n_to_rgb_u16_or_rgba_u16_row< x += 8; } - } // end if !BE + } // end if BE == HOST_NATIVE_BE if x < width { let tail_packed = &packed[x * 2..width * 2]; @@ -436,7 +451,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { // Y permute mask: pick even u16 lanes (low byte at [0], high byte // at [1]) into the low 8 bytes; high 8 bytes zeroed. let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); @@ -504,7 +519,7 @@ pub(crate) unsafe fn y2xx_n_to_luma_u16_row( // SAFETY: caller's obligation per the safety contract above. unsafe { let mut x = 0usize; - if !BE { + if BE == HOST_NATIVE_BE { let shr_count = _mm_cvtsi32_si128((16 - BITS) as i32); let y_idx = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, -1, -1, -1, -1, -1, -1, -1, -1); diff --git a/src/row/dispatch/v210.rs b/src/row/dispatch/v210.rs index 7d1c14b1..7ce9fbfe 100644 --- a/src/row/dispatch/v210.rs +++ b/src/row/dispatch/v210.rs @@ -34,7 +34,7 @@ use crate::{ /// contract. `use_simd = false` forces scalar. `big_endian = true` selects /// the big-endian wire encoding (32-bit words stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_rgb_row( +pub fn v210_to_rgb_row_endian( packed: &[u8], rgb_out: &mut [u8], width: usize, @@ -120,9 +120,25 @@ pub fn v210_to_rgb_row( ); } +/// LE-only wrapper around [`v210_to_rgb_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_rgb_row_endian( +/// ..., big_endian = false)`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_rgb_row( + packed: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + v210_to_rgb_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of v210 to packed RGBA (u8) with `α = 0xFF`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_rgba_row( +pub fn v210_to_rgba_row_endian( packed: &[u8], rgba_out: &mut [u8], width: usize, @@ -208,10 +224,26 @@ pub fn v210_to_rgba_row( ); } +/// LE-only wrapper around [`v210_to_rgba_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_rgba_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_rgba_row( + packed: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + v210_to_rgba_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of v210 to packed `u16` RGB at native 10-bit /// depth (low-bit-packed, `[0, 1023]`). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_rgb_u16_row( +pub fn v210_to_rgb_u16_row_endian( packed: &[u8], rgb_out: &mut [u16], width: usize, @@ -301,10 +333,26 @@ pub fn v210_to_rgb_u16_row( ); } +/// LE-only wrapper around [`v210_to_rgb_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_rgb_u16_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_rgb_u16_row( + packed: &[u8], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + v210_to_rgb_u16_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of v210 to packed `u16` RGBA at native 10-bit /// depth with `α = 1023` (10-bit opaque maximum). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_rgba_u16_row( +pub fn v210_to_rgba_u16_row_endian( packed: &[u8], rgba_out: &mut [u16], width: usize, @@ -394,10 +442,26 @@ pub fn v210_to_rgba_u16_row( ); } +/// LE-only wrapper around [`v210_to_rgba_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_rgba_u16_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_rgba_u16_row( + packed: &[u8], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + v210_to_rgba_u16_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false) +} + /// Extracts one row of 8-bit luma from a packed v210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_row( +pub fn v210_to_luma_row_endian( packed: &[u8], luma_out: &mut [u8], width: usize, @@ -478,11 +542,20 @@ pub fn v210_to_luma_row( ); } +/// LE-only wrapper around [`v210_to_luma_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_luma_row_endian( +/// ..., big_endian = false)`. 
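+///
+/// Usage sketch (illustrative only — the zeroed buffer is a placeholder,
+/// not a meaningful v210 frame):
+///
+/// ```ignore
+/// let packed = vec![0u8; 16]; // one 4-word v210 group = 6 px
+/// let mut luma = vec![0u8; 6];
+/// v210_to_luma_row(&packed, &mut luma, 6, false);
+/// // For a big-endian wire buffer, call the endian-aware form:
+/// v210_to_luma_row_endian(&packed, &mut luma, 6, false, true);
+/// ```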
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_luma_row(packed: &[u8], luma_out: &mut [u8], width: usize, use_simd: bool) { + v210_to_luma_row_endian(packed, luma_out, width, use_simd, false) +} + /// Extracts one row of native-depth `u16` luma from a packed v210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v210_to_luma_u16_row( +pub fn v210_to_luma_u16_row_endian( packed: &[u8], luma_out: &mut [u16], width: usize, @@ -563,7 +636,24 @@ pub fn v210_to_luma_u16_row( ); } +/// LE-only wrapper around [`v210_to_luma_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `v210_to_luma_u16_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn v210_to_luma_u16_row(packed: &[u8], luma_out: &mut [u16], width: usize, use_simd: bool) { + v210_to_luma_u16_row_endian(packed, luma_out, width, use_simd, false) +} + #[cfg(all(test, feature = "std"))] +// LE-host-only: tests in this module use host-native u16/u8 literals as if +// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap +// reinterprets host-native storage and produces a different logical value +// than the literal, breaking the assertions. The kernel's BE-host correctness +// is locked down by the dedicated host-independent BE/LE parity tests in the +// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`, +// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`. +#[cfg(target_endian = "little")] mod tests { //! Smoke tests for the public v210 dispatchers. Walker / kernel //! correctness lives in the per-arch tests @@ -604,7 +694,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 6 * 3]; - v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false, false); + v210_to_rgb_row(&word, &mut rgb, 6, ColorMatrix::Bt709, true, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -613,7 +703,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 6 * 4]; - v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false, false); + v210_to_rgba_row(&word, &mut rgba, 6, ColorMatrix::Bt709, true, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -621,15 +711,7 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 6 * 3]; - v210_to_rgb_u16_row( - &word, - &mut rgb_u16, - 6, - ColorMatrix::Bt709, - true, - false, - false, - ); + v210_to_rgb_u16_row(&word, &mut rgb_u16, 6, ColorMatrix::Bt709, true, false); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -638,29 +720,21 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 6 * 4]; - v210_to_rgba_u16_row( - &word, - &mut rgba_u16, - 6, - ColorMatrix::Bt709, - true, - false, - false, - ); + v210_to_rgba_u16_row(&word, &mut rgba_u16, 6, ColorMatrix::Bt709, true, false); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 6]; - v210_to_luma_row(&word, &mut luma, 6, false, false); + v210_to_luma_row(&word, &mut luma, 6, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. 
let mut luma_u16 = [0u16; 6]; - v210_to_luma_u16_row(&word, &mut luma_u16, 6, false, false); + v210_to_luma_u16_row(&word, &mut luma_u16, 6, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y210.rs b/src/row/dispatch/y210.rs index 97bd0766..e81b7c60 100644 --- a/src/row/dispatch/y210.rs +++ b/src/row/dispatch/y210.rs @@ -34,7 +34,7 @@ use crate::{ /// contract. `use_simd = false` forces scalar. `big_endian = true` selects /// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_rgb_row( +pub fn y210_to_rgb_row_endian( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -120,9 +120,25 @@ pub fn y210_to_rgb_row( ); } +/// LE-only wrapper around [`y210_to_rgb_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_rgb_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_rgb_row( + packed: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y210_to_rgb_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of Y210 to packed RGBA (u8) with `α = 0xFF`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_rgba_row( +pub fn y210_to_rgba_row_endian( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -208,10 +224,26 @@ pub fn y210_to_rgba_row( ); } +/// LE-only wrapper around [`y210_to_rgba_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_rgba_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_rgba_row( + packed: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y210_to_rgba_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of Y210 to packed `u16` RGB at native 10-bit /// depth (low-bit-packed, `[0, 1023]`). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_rgb_u16_row( +pub fn y210_to_rgb_u16_row_endian( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -301,10 +333,26 @@ pub fn y210_to_rgb_u16_row( ); } +/// LE-only wrapper around [`y210_to_rgb_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_rgb_u16_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_rgb_u16_row( + packed: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y210_to_rgb_u16_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of Y210 to packed `u16` RGBA at native 10-bit /// depth with `α = 1023` (10-bit opaque maximum). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_rgba_u16_row( +pub fn y210_to_rgba_u16_row_endian( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -394,10 +442,26 @@ pub fn y210_to_rgba_u16_row( ); } +/// LE-only wrapper around [`y210_to_rgba_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_rgba_u16_row_endian( +/// ..., big_endian = false)`. 
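+///
+/// Usage sketch (illustrative only — the zeroed input is a placeholder):
+///
+/// ```ignore
+/// let packed = vec![0u16; 8 * 2]; // 8 px of (Y₀, U, Y₁, V)
+/// let mut rgba = vec![0u16; 8 * 4];
+/// y210_to_rgba_u16_row(&packed, &mut rgba, 8, ColorMatrix::Bt709, false, false);
+/// // For a big-endian wire buffer, call the endian-aware form:
+/// y210_to_rgba_u16_row_endian(&packed, &mut rgba, 8, ColorMatrix::Bt709, false, false, true);
+/// ```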
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_rgba_u16_row( + packed: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y210_to_rgba_u16_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false) +} + /// Extracts one row of 8-bit luma from a packed Y210 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_row( +pub fn y210_to_luma_row_endian( packed: &[u16], luma_out: &mut [u8], width: usize, @@ -478,11 +542,20 @@ pub fn y210_to_luma_row( ); } +/// LE-only wrapper around [`y210_to_luma_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_luma_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { + y210_to_luma_row_endian(packed, luma_out, width, use_simd, false) +} + /// Extracts one row of native-depth `u16` luma from a packed Y210 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in /// its low 10 bits). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y210_to_luma_u16_row( +pub fn y210_to_luma_u16_row_endian( packed: &[u16], luma_out: &mut [u16], width: usize, @@ -563,7 +636,24 @@ pub fn y210_to_luma_u16_row( ); } +/// LE-only wrapper around [`y210_to_luma_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y210_to_luma_u16_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y210_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { + y210_to_luma_u16_row_endian(packed, luma_out, width, use_simd, false) +} + #[cfg(all(test, feature = "std"))] +// LE-host-only: tests in this module use host-native u16/u8 literals as if +// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap +// reinterprets host-native storage and produces a different logical value +// than the literal, breaking the assertions. The kernel's BE-host correctness +// is locked down by the dedicated host-independent BE/LE parity tests in the +// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`, +// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`. +#[cfg(target_endian = "little")] mod tests { //! Smoke tests for the public Y210 dispatchers. Walker / kernel //! correctness lives in the per-arch tests @@ -602,7 +692,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); + y210_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -611,7 +701,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); + y210_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -619,15 +709,7 @@ mod tests { // u16 RGB at native 10-bit depth. 
let mut rgb_u16 = [0u16; 8 * 3]; - y210_to_rgb_u16_row( - &buf, - &mut rgb_u16, - 8, - ColorMatrix::Bt709, - true, - false, - false, - ); + y210_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -636,29 +718,21 @@ mod tests { // u16 RGBA — alpha = 1023. let mut rgba_u16 = [0u16; 8 * 4]; - y210_to_rgba_u16_row( - &buf, - &mut rgba_u16, - 8, - ColorMatrix::Bt709, - true, - false, - false, - ); + y210_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. let mut luma = [0u8; 8]; - y210_to_luma_row(&buf, &mut luma, 8, false, false); + y210_to_luma_row(&buf, &mut luma, 8, false); for &y in &luma { assert_eq!(y, (512u16 >> 2) as u8); } // u16 luma — low-packed 10-bit Y. let mut luma_u16 = [0u16; 8]; - y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); + y210_to_luma_u16_row(&buf, &mut luma_u16, 8, false); for &y in &luma_u16 { assert_eq!(y, 512); } diff --git a/src/row/dispatch/y212.rs b/src/row/dispatch/y212.rs index 2245c50e..ef0c5dfd 100644 --- a/src/row/dispatch/y212.rs +++ b/src/row/dispatch/y212.rs @@ -34,7 +34,7 @@ use crate::{ /// contract. `use_simd = false` forces scalar. `big_endian = true` selects /// the big-endian wire encoding (u16 samples stored MSB-first). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_rgb_row( +pub fn y212_to_rgb_row_endian( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -120,9 +120,25 @@ pub fn y212_to_rgb_row( ); } +/// LE-only wrapper around [`y212_to_rgb_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y212_to_rgb_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y212_to_rgb_row( + packed: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y212_to_rgb_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of Y212 to packed RGBA (u8) with `α = 0xFF`. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_rgba_row( +pub fn y212_to_rgba_row_endian( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -208,10 +224,26 @@ pub fn y212_to_rgba_row( ); } +/// LE-only wrapper around [`y212_to_rgba_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y212_to_rgba_row_endian( +/// ..., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn y212_to_rgba_row( + packed: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + y212_to_rgba_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false) +} + /// Converts one row of Y212 to packed `u16` RGB at native 12-bit /// depth (low-bit-packed, `[0, 4095]`). #[cfg_attr(not(tarpaulin), inline(always))] -pub fn y212_to_rgb_u16_row( +pub fn y212_to_rgb_u16_row_endian( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -301,10 +333,26 @@ pub fn y212_to_rgb_u16_row( ); } +/// LE-only wrapper around [`y212_to_rgb_u16_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to `y212_to_rgb_u16_row_endian( +/// ..., big_endian = false)`. 
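+///
+/// Usage sketch (illustrative only — the zeroed input is a placeholder):
+///
+/// ```ignore
+/// let packed = vec![0u16; 8 * 2]; // 8 px of (Y₀, U, Y₁, V)
+/// let mut rgb = vec![0u16; 8 * 3]; // low-bit-packed `[0, 4095]` output
+/// y212_to_rgb_u16_row(&packed, &mut rgb, 8, ColorMatrix::Bt709, false, false);
+/// // For a big-endian wire buffer, call the endian-aware form:
+/// y212_to_rgb_u16_row_endian(&packed, &mut rgb, 8, ColorMatrix::Bt709, false, false, true);
+/// ```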
@@ -301,10 +333,26 @@ pub fn y212_to_rgb_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y212_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y212_to_rgb_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y212_to_rgb_u16_row(
+    packed: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y212_to_rgb_u16_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Converts one row of Y212 to packed `u16` RGBA at native 12-bit
 /// depth with `α = 4095` (12-bit opaque maximum).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y212_to_rgba_u16_row(
+pub fn y212_to_rgba_u16_row_endian(
     packed: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -394,10 +442,26 @@ pub fn y212_to_rgba_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y212_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y212_to_rgba_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y212_to_rgba_u16_row(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y212_to_rgba_u16_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Extracts one row of 8-bit luma from a packed Y212 buffer.
 /// Y values are downshifted from 12-bit to 8-bit via `>> 4`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y212_to_luma_row(
+pub fn y212_to_luma_row_endian(
     packed: &[u16],
     luma_out: &mut [u8],
     width: usize,
@@ -478,11 +542,20 @@ pub fn y212_to_luma_row(
     );
 }
 
+/// LE-only wrapper around [`y212_to_luma_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y212_to_luma_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y212_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) {
+    y212_to_luma_row_endian(packed, luma_out, width, use_simd, false)
+}
+
 /// Extracts one row of native-depth `u16` luma from a packed Y212
 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in
 /// its low 12 bits).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y212_to_luma_u16_row(
+pub fn y212_to_luma_u16_row_endian(
     packed: &[u16],
     luma_out: &mut [u16],
     width: usize,
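Why the smoke-test modules below are gated to little-endian hosts: a host-native literal coincides with the LE wire encoding only on an LE host, because `u16::from_le` is the identity there but a byte-swap on a BE host. A self-contained illustration of that claim:

    let literal = 0x0800u16;
    if cfg!(target_endian = "little") {
        assert_eq!(u16::from_le(literal), 0x0800); // identity: literal == wire
    } else {
        assert_eq!(u16::from_le(literal), 0x0008); // swapped: a different sample
    }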
@@ -563,7 +636,24 @@ pub fn y212_to_luma_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y212_to_luma_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y212_to_luma_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y212_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) {
+    y212_to_luma_u16_row_endian(packed, luma_out, width, use_simd, false)
+}
+
 #[cfg(all(test, feature = "std"))]
+// LE-host-only: tests in this module use host-native u16/u8 literals as if
+// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap
+// reinterprets host-native storage and produces a different logical value
+// than the literal, breaking the assertions. The kernel's BE-host correctness
+// is locked down by the dedicated host-independent BE/LE parity tests in the
+// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`,
+// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`.
+#[cfg(target_endian = "little")]
 mod tests {
     //! Smoke tests for the public Y212 dispatchers. Walker / kernel
     //! correctness lives in the per-arch tests
@@ -602,7 +692,7 @@ mod tests {
 
         // u8 RGB
         let mut rgb = [0u8; 8 * 3];
-        y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false);
+        y212_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false);
         for px in rgb.chunks(3) {
             assert!(px[0].abs_diff(128) <= 1);
             assert_eq!(px[0], px[1]);
@@ -611,7 +701,7 @@ mod tests {
 
         // u8 RGBA — alpha = 0xFF
         let mut rgba = [0u8; 8 * 4];
-        y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false);
+        y212_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false);
         for px in rgba.chunks(4) {
             assert!(px[0].abs_diff(128) <= 1);
             assert_eq!(px[3], 0xFF);
@@ -619,15 +709,7 @@ mod tests {
 
         // u16 RGB at native 12-bit depth.
         let mut rgb_u16 = [0u16; 8 * 3];
-        y212_to_rgb_u16_row(
-            &buf,
-            &mut rgb_u16,
-            8,
-            ColorMatrix::Bt709,
-            true,
-            false,
-            false,
-        );
+        y212_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false);
         for px in rgb_u16.chunks(3) {
             assert!(px[0].abs_diff(2048) <= 2);
             assert_eq!(px[0], px[1]);
@@ -636,29 +718,21 @@ mod tests {
 
         // u16 RGBA — alpha = 4095.
         let mut rgba_u16 = [0u16; 8 * 4];
-        y212_to_rgba_u16_row(
-            &buf,
-            &mut rgba_u16,
-            8,
-            ColorMatrix::Bt709,
-            true,
-            false,
-            false,
-        );
+        y212_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false);
         for px in rgba_u16.chunks(4) {
             assert_eq!(px[3], 4095);
         }
 
         // u8 luma — Y=2048 → 128 after `>> 4`.
         let mut luma = [0u8; 8];
-        y212_to_luma_row(&buf, &mut luma, 8, false, false);
+        y212_to_luma_row(&buf, &mut luma, 8, false);
        for &y in &luma {
             assert_eq!(y, (2048u16 >> 4) as u8);
         }
 
         // u16 luma — low-packed 12-bit Y.
         let mut luma_u16 = [0u16; 8];
-        y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false);
+        y212_to_luma_u16_row(&buf, &mut luma_u16, 8, false);
         for &y in &luma_u16 {
             assert_eq!(y, 2048);
         }
diff --git a/src/row/dispatch/y216.rs b/src/row/dispatch/y216.rs
index 541022c7..93be5dd1 100644
--- a/src/row/dispatch/y216.rs
+++ b/src/row/dispatch/y216.rs
@@ -33,7 +33,7 @@ use crate::{
 /// contract. `use_simd = false` forces scalar. `big_endian = true` selects
 /// the big-endian wire encoding (u16 samples stored MSB-first).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_rgb_row(
+pub fn y216_to_rgb_row_endian(
     packed: &[u16],
     rgb_out: &mut [u8],
     width: usize,
@@ -119,9 +119,25 @@ pub fn y216_to_rgb_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_rgb_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_rgb_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_rgb_row(
+    packed: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y216_to_rgb_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Converts one row of Y216 to packed RGBA (u8) with `α = 0xFF`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_rgba_row(
+pub fn y216_to_rgba_row_endian(
     packed: &[u16],
     rgba_out: &mut [u8],
     width: usize,
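The opaque-alpha constants the smoke tests assert all follow one rule, `(1 << bits) - 1`. A tiny restatement (`opaque_max` is a hypothetical helper for illustration, not crate API):

    fn opaque_max(bits: u32) -> u16 {
        ((1u32 << bits) - 1) as u16
    }
    assert_eq!(opaque_max(8), 0xFF);    // u8 RGBA
    assert_eq!(opaque_max(10), 1023);   // Y210 u16 RGBA
    assert_eq!(opaque_max(12), 4095);   // Y212 u16 RGBA
    assert_eq!(opaque_max(16), 0xFFFF); // Y216 u16 RGBA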
@@ -207,10 +223,26 @@ pub fn y216_to_rgba_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_rgba_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_rgba_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_rgba_row(
+    packed: &[u16],
+    rgba_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y216_to_rgba_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Converts one row of Y216 to packed `u16` RGB at native 16-bit
 /// depth (full-range, `[0, 65535]`).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_rgb_u16_row(
+pub fn y216_to_rgb_u16_row_endian(
     packed: &[u16],
     rgb_out: &mut [u16],
     width: usize,
@@ -300,10 +332,26 @@ pub fn y216_to_rgb_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_rgb_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_rgb_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_rgb_u16_row(
+    packed: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y216_to_rgb_u16_row_endian(packed, rgb_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Converts one row of Y216 to packed `u16` RGBA at native 16-bit
 /// depth with `α = 0xFFFF` (16-bit opaque maximum).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_rgba_u16_row(
+pub fn y216_to_rgba_u16_row_endian(
     packed: &[u16],
     rgba_out: &mut [u16],
     width: usize,
@@ -393,10 +441,26 @@ pub fn y216_to_rgba_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_rgba_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_rgba_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_rgba_u16_row(
+    packed: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    y216_to_rgba_u16_row_endian(packed, rgba_out, width, matrix, full_range, use_simd, false)
+}
+
 /// Extracts one row of 8-bit luma from a packed Y216 buffer.
 /// Y values are downshifted from 16-bit to 8-bit via `>> 8`.
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_luma_row(
+pub fn y216_to_luma_row_endian(
     packed: &[u16],
     luma_out: &mut [u8],
     width: usize,
@@ -477,10 +541,19 @@ pub fn y216_to_luma_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_luma_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_luma_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) {
+    y216_to_luma_row_endian(packed, luma_out, width, use_simd, false)
+}
+
 /// Extracts one row of native-depth `u16` luma from a packed Y216
 /// buffer (full-range: each `u16` carries the 16-bit Y value directly).
 #[cfg_attr(not(tarpaulin), inline(always))]
-pub fn y216_to_luma_u16_row(
+pub fn y216_to_luma_u16_row_endian(
     packed: &[u16],
     luma_out: &mut [u16],
     width: usize,
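Likewise, every 8-bit luma path shifts by `bits - 8`, which is why each format's mid-grey fixture lands on 128:

    assert_eq!(512u16 >> 2, 128);   // Y210: 10-bit Y
    assert_eq!(2048u16 >> 4, 128);  // Y212: 12-bit Y
    assert_eq!(32768u16 >> 8, 128); // Y216: 16-bit Y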
@@ -561,7 +634,24 @@ pub fn y216_to_luma_u16_row(
     );
 }
 
+/// LE-only wrapper around [`y216_to_luma_u16_row_endian`]; preserves the
+/// pre-endian-aware public signature so existing little-endian
+/// callers compile unchanged. Equivalent to `y216_to_luma_u16_row_endian(
+/// ..., big_endian = false)`.
+#[cfg_attr(not(tarpaulin), inline(always))]
+pub fn y216_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) {
+    y216_to_luma_u16_row_endian(packed, luma_out, width, use_simd, false)
+}
+
 #[cfg(all(test, feature = "std"))]
+// LE-host-only: tests in this module use host-native u16/u8 literals as if
+// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap
+// reinterprets host-native storage and produces a different logical value
+// than the literal, breaking the assertions. The kernel's BE-host correctness
+// is locked down by the dedicated host-independent BE/LE parity tests in the
+// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`,
+// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`.
+#[cfg(target_endian = "little")]
 mod tests {
     //! Smoke tests for the public Y216 dispatchers. Walker / kernel
     //! correctness lives in the per-arch tests and the scalar reference's
@@ -600,7 +690,7 @@ mod tests {
 
         // u8 RGB
         let mut rgb = [0u8; 8 * 3];
-        y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false);
+        y216_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false);
         for px in rgb.chunks(3) {
             assert!(px[0].abs_diff(128) <= 1);
             assert_eq!(px[0], px[1]);
@@ -609,7 +699,7 @@ mod tests {
 
         // u8 RGBA — alpha = 0xFF
         let mut rgba = [0u8; 8 * 4];
-        y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false);
+        y216_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false);
         for px in rgba.chunks(4) {
             assert!(px[0].abs_diff(128) <= 1);
             assert_eq!(px[3], 0xFF);
@@ -617,15 +707,7 @@ mod tests {
 
         // u16 RGB at native 16-bit depth.
         let mut rgb_u16 = [0u16; 8 * 3];
-        y216_to_rgb_u16_row(
-            &buf,
-            &mut rgb_u16,
-            8,
-            ColorMatrix::Bt709,
-            true,
-            false,
-            false,
-        );
+        y216_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false);
         for px in rgb_u16.chunks(3) {
             assert!(px[0].abs_diff(32768) <= 4);
             assert_eq!(px[0], px[1]);
@@ -634,29 +716,21 @@ mod tests {
 
         // u16 RGBA — alpha = 0xFFFF.
         let mut rgba_u16 = [0u16; 8 * 4];
-        y216_to_rgba_u16_row(
-            &buf,
-            &mut rgba_u16,
-            8,
-            ColorMatrix::Bt709,
-            true,
-            false,
-            false,
-        );
+        y216_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false);
         for px in rgba_u16.chunks(4) {
             assert_eq!(px[3], 0xFFFF);
         }
 
         // u8 luma — Y=32768 → 128 after `>> 8`.
         let mut luma = [0u8; 8];
-        y216_to_luma_row(&buf, &mut luma, 8, false, false);
+        y216_to_luma_row(&buf, &mut luma, 8, false);
         for &y in &luma {
             assert_eq!(y, (32768u16 >> 8) as u8);
         }
 
         // u16 luma — full 16-bit Y value.
         let mut luma_u16 = [0u16; 8];
-        y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false);
+        y216_to_luma_u16_row(&buf, &mut luma_u16, 8, false);
         for &y in &luma_u16 {
             assert_eq!(y, 32768);
         }
@@ -668,7 +742,7 @@ mod tests {
         // packed buffer has only 2 elements for width=4 (needs 8).
         let packed = [0u16; 2];
         let mut rgb = [0u8; 4 * 3];
-        y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false);
+        y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false);
     }
 
     #[test]
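The size contract behind these panic tests, as plain arithmetic: a 4:2:2 pair packs two pixels into four u16 words, so `packed` needs `width * 2` words and u8 RGB output needs `width * 3` bytes. A sketch (`y216_min_sizes` is a hypothetical helper, not crate API):

    fn y216_min_sizes(width: usize) -> (usize, usize) {
        (width * 2, width * 3) // (packed u16 words, RGB u8 bytes)
    }
    assert_eq!(y216_min_sizes(4), (8, 12)); // the sizes the test comments call out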
@@ -677,7 +751,7 @@ mod tests {
         // output buffer has only 2 bytes for width=4 (needs 12).
         let packed = [0u16; 8];
         let mut rgb = [0u8; 2];
-        y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false);
+        y216_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false);
     }
 
     #[test]
@@ -685,7 +759,7 @@ mod tests {
     fn y216_dispatcher_rejects_odd_width() {
         let packed = [0u16; 6];
         let mut rgb = [0u8; 9];
-        y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false, false);
+        y216_to_rgb_row(&packed, &mut rgb, 3, ColorMatrix::Bt709, true, false);
     }
 
     #[test]
@@ -706,7 +780,6 @@ mod tests {
             ColorMatrix::Bt709,
             true,
             false,
-            false,
         );
     }
 }
diff --git a/src/row/mod.rs b/src/row/mod.rs
index 0a476276..b2502de8 100644
--- a/src/row/mod.rs
+++ b/src/row/mod.rs
@@ -944,15 +944,7 @@ mod overflow_tests {
         let candidate = ((usize::MAX / 16) + 1) * 6;
         let p: [u8; 0] = [];
         let mut rgb: [u8; 0] = [];
-        v210_to_rgb_row(
-            &p,
-            &mut rgb,
-            candidate,
-            ColorMatrix::Bt601,
-            true,
-            false,
-            false,
-        );
+        v210_to_rgb_row(&p, &mut rgb, candidate, ColorMatrix::Bt601, true, false);
     }
 
     // ---- Y2xx dispatcher — `width × 2` overflow ----
@@ -982,7 +974,6 @@ mod overflow_tests {
             ColorMatrix::Bt601,
             true,
             false,
-            false,
         );
     }
 }
diff --git a/src/row/scalar/v210.rs b/src/row/scalar/v210.rs
index 1b9db248..fc865e47 100644
--- a/src/row/scalar/v210.rs
+++ b/src/row/scalar/v210.rs
@@ -344,6 +344,14 @@ pub(crate) fn v210_to_luma_u16_row(
 }
 
 #[cfg(all(test, feature = "std"))]
+// LE-host-only: tests in this module use host-native u16/u8 literals as if
+// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap
+// reinterprets host-native storage and produces a different logical value
+// than the literal, breaking the assertions. The kernel's BE-host correctness
+// is locked down by the dedicated host-independent BE/LE parity tests in the
+// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`,
+// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`.
+#[cfg(target_endian = "little")]
 mod tests {
     use super::*;
     use crate::ColorMatrix;
diff --git a/src/row/scalar/y216.rs b/src/row/scalar/y216.rs
index 291e8914..2b3c6903 100644
--- a/src/row/scalar/y216.rs
+++ b/src/row/scalar/y216.rs
@@ -155,6 +155,14 @@ pub(crate) fn y216_to_luma_u16_row(packed: &[u16], out: &mut [u1
 }
 
 #[cfg(all(test, feature = "std"))]
+// LE-host-only: tests in this module use host-native u16/u8 literals as if
+// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap
+// reinterprets host-native storage and produces a different logical value
+// than the literal, breaking the assertions. The kernel's BE-host correctness
+// is locked down by the dedicated host-independent BE/LE parity tests in the
+// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`,
+// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`.
+#[cfg(target_endian = "little")]
 mod tests {
     use super::*;
     use crate::ColorMatrix;
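The `overflow_tests` width in src/row/mod.rs above is picked so that the v210 byte requirement `width.div_ceil(6) * 16` cannot fit in `usize`; the dispatcher must detect this (plausibly via a checked multiply; the exact guard is internal to the crate) rather than let the arithmetic wrap:

    let candidate = ((usize::MAX / 16) + 1) * 6;
    let groups = candidate.div_ceil(6);       // = usize::MAX / 16 + 1
    assert_eq!(groups.checked_mul(16), None); // 16-byte v210 groups overflow usize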
diff --git a/src/row/scalar/y2xx.rs b/src/row/scalar/y2xx.rs
index d3c7a8b3..ef13b8d4 100644
--- a/src/row/scalar/y2xx.rs
+++ b/src/row/scalar/y2xx.rs
@@ -344,6 +344,14 @@ pub(crate) fn y212_to_luma_u16_row(
 }
 
 #[cfg(all(test, feature = "std"))]
+// LE-host-only: tests in this module use host-native u16/u8 literals as if
+// they were LE-encoded bytes; on a BE host the kernel's `from_le` byte-swap
+// reinterprets host-native storage and produces a different logical value
+// than the literal, breaking the assertions. The kernel's BE-host correctness
+// is locked down by the dedicated host-independent BE/LE parity tests in the
+// per-arch test files (which build fixtures via `to_le_bytes` / `to_be_bytes`,
+// not `swap_bytes`). Mirrors the gating from PR #82 `8f2e329`.
+#[cfg(target_endian = "little")]
 mod tests {
     use super::*;
     use crate::ColorMatrix;
diff --git a/src/sinker/mixed/v210.rs b/src/sinker/mixed/v210.rs
index e59a624a..5db6c557 100644
--- a/src/sinker/mixed/v210.rs
+++ b/src/sinker/mixed/v210.rs
@@ -39,9 +39,9 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, v210_to_luma_row,
-        v210_to_luma_u16_row, v210_to_rgb_row, v210_to_rgb_u16_row, v210_to_rgba_row,
-        v210_to_rgba_u16_row,
+        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row,
+        v210_to_luma_row_endian, v210_to_luma_u16_row_endian, v210_to_rgb_row_endian,
+        v210_to_rgb_u16_row_endian, v210_to_rgba_row_endian, v210_to_rgba_u16_row_endian,
     },
     yuv::{V210, V210Row, V210Sink},
 };
@@ -207,7 +207,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         // Luma u8 — extract 8-bit Y bytes from the v210 plane via the
         // dedicated kernel (downshifts 10→8 inline).
         if let Some(buf) = luma.as_deref_mut() {
-            v210_to_luma_row(
+            v210_to_luma_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -217,7 +217,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         }
         // Luma u16 — extract 10-bit Y values at native depth.
         if let Some(buf) = luma_u16.as_deref_mut() {
-            v210_to_luma_u16_row(
+            v210_to_luma_u16_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -236,7 +236,7 @@ impl PixelSink for MixedSinker<'_, V210> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            v210_to_rgba_u16_row(
+            v210_to_rgba_u16_row_endian(
                 packed,
                 rgba_u16_row,
                 w,
@@ -257,7 +257,7 @@ impl PixelSink for MixedSinker<'_, V210> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            v210_to_rgb_u16_row(
+            v210_to_rgb_u16_row_endian(
                 packed,
                 rgb_u16_row,
                 w,
@@ -288,7 +288,7 @@ impl PixelSink for MixedSinker<'_, V210> {
         if want_rgba && !need_u8_rgb_kernel {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            v210_to_rgba_row(
+            v210_to_rgba_row_endian(
                 packed,
                 rgba_row,
                 w,
@@ -312,7 +312,7 @@ impl PixelSink for MixedSinker<'_, V210> {
                 w,
                 h,
             )?;
-            v210_to_rgb_row(
+            v210_to_rgb_row_endian(
                 packed,
                 rgb_row,
                 w,
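All four mixed sinkers now route through the `_endian` dispatchers so wire endianness can be threaded down to the kernels. Reduced to its essence, the gate is a per-sample load; a plausible sketch only (`load_sample` is illustrative, not the crate's kernel code):

    #[inline]
    fn load_sample(raw: u16, big_endian: bool) -> u16 {
        if big_endian { u16::from_be(raw) } else { u16::from_le(raw) }
    }

    // Host-independent check: both wire encodings of 0x0800 decode equal.
    let wire = 0x0800u16;
    assert_eq!(
        load_sample(u16::from_ne_bytes(wire.to_le_bytes()), false),
        load_sample(u16::from_ne_bytes(wire.to_be_bytes()), true),
    );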
diff --git a/src/sinker/mixed/y210.rs b/src/sinker/mixed/y210.rs
index 430b2955..3fa4ed3e 100644
--- a/src/sinker/mixed/y210.rs
+++ b/src/sinker/mixed/y210.rs
@@ -40,9 +40,9 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, y210_to_luma_row,
-        y210_to_luma_u16_row, y210_to_rgb_row, y210_to_rgb_u16_row, y210_to_rgba_row,
-        y210_to_rgba_u16_row,
+        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row,
+        y210_to_luma_row_endian, y210_to_luma_u16_row_endian, y210_to_rgb_row_endian,
+        y210_to_rgb_u16_row_endian, y210_to_rgba_row_endian, y210_to_rgba_u16_row_endian,
     },
     yuv::{Y210, Y210Row, Y210Sink},
 };
@@ -208,7 +208,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         // Luma u8 — extract 8-bit Y bytes from the Y210 plane via the
         // dedicated kernel (downshifts MSB-aligned 10→8 inline).
         if let Some(buf) = luma.as_deref_mut() {
-            y210_to_luma_row(
+            y210_to_luma_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -218,7 +218,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         }
         // Luma u16 — extract 10-bit Y values at native depth.
         if let Some(buf) = luma_u16.as_deref_mut() {
-            y210_to_luma_u16_row(
+            y210_to_luma_u16_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -237,7 +237,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            y210_to_rgba_u16_row(
+            y210_to_rgba_u16_row_endian(
                 packed,
                 rgba_u16_row,
                 w,
@@ -258,7 +258,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            y210_to_rgb_u16_row(
+            y210_to_rgb_u16_row_endian(
                 packed,
                 rgb_u16_row,
                 w,
@@ -289,7 +289,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
         if want_rgba && !need_u8_rgb_kernel {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            y210_to_rgba_row(
+            y210_to_rgba_row_endian(
                 packed,
                 rgba_row,
                 w,
@@ -313,7 +313,7 @@ impl PixelSink for MixedSinker<'_, Y210> {
                 w,
                 h,
             )?;
-            y210_to_rgb_row(
+            y210_to_rgb_row_endian(
                 packed,
                 rgb_row,
                 w,
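The sinker comments ("downshifts MSB-aligned N→8 inline") and the dispatcher docs (`>> 2` / `>> 4` / `>> 8` at logical depth) describe the same operation: assuming the MSB-aligned storage the comments name, the 8-bit luma is always the high byte of the wire word, whatever the nominal depth:

    let word = 0x8000u16; // 10-bit Y = 512 stored as 512 << 6; equally 2048 << 4 or 32768
    assert_eq!((word >> 8) as u8, 128);        // take the high byte
    assert_eq!(((word >> 6) >> 2) as u8, 128); // extract 10-bit value, then 10→8
    assert_eq!(((word >> 4) >> 4) as u8, 128); // extract 12-bit value, then 12→8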
diff --git a/src/sinker/mixed/y212.rs b/src/sinker/mixed/y212.rs
index 1582e61e..35270f89 100644
--- a/src/sinker/mixed/y212.rs
+++ b/src/sinker/mixed/y212.rs
@@ -39,9 +39,9 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, y212_to_luma_row,
-        y212_to_luma_u16_row, y212_to_rgb_row, y212_to_rgb_u16_row, y212_to_rgba_row,
-        y212_to_rgba_u16_row,
+        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row,
+        y212_to_luma_row_endian, y212_to_luma_u16_row_endian, y212_to_rgb_row_endian,
+        y212_to_rgb_u16_row_endian, y212_to_rgba_row_endian, y212_to_rgba_u16_row_endian,
     },
     yuv::{Y212, Y212Row, Y212Sink},
 };
@@ -206,7 +206,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         // Luma u8 — extract 8-bit Y bytes from the Y212 plane via the
         // dedicated kernel (downshifts MSB-aligned 12→8 inline).
         if let Some(buf) = luma.as_deref_mut() {
-            y212_to_luma_row(
+            y212_to_luma_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -216,7 +216,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         }
         // Luma u16 — extract 12-bit Y values at native depth.
         if let Some(buf) = luma_u16.as_deref_mut() {
-            y212_to_luma_u16_row(
+            y212_to_luma_u16_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -235,7 +235,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            y212_to_rgba_u16_row(
+            y212_to_rgba_u16_row_endian(
                 packed,
                 rgba_u16_row,
                 w,
@@ -256,7 +256,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            y212_to_rgb_u16_row(
+            y212_to_rgb_u16_row_endian(
                 packed,
                 rgb_u16_row,
                 w,
@@ -287,7 +287,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
         if want_rgba && !need_u8_rgb_kernel {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            y212_to_rgba_row(
+            y212_to_rgba_row_endian(
                 packed,
                 rgba_row,
                 w,
@@ -311,7 +311,7 @@ impl PixelSink for MixedSinker<'_, Y212> {
                 w,
                 h,
             )?;
-            y212_to_rgb_row(
+            y212_to_rgb_row_endian(
                 packed,
                 rgb_row,
                 w,
diff --git a/src/sinker/mixed/y216.rs b/src/sinker/mixed/y216.rs
index 4fdbb951..6637fd08 100644
--- a/src/sinker/mixed/y216.rs
+++ b/src/sinker/mixed/y216.rs
@@ -41,9 +41,9 @@ use super::{
 use crate::{
     PixelSink,
     row::{
-        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row, y216_to_luma_row,
-        y216_to_luma_u16_row, y216_to_rgb_row, y216_to_rgb_u16_row, y216_to_rgba_row,
-        y216_to_rgba_u16_row,
+        expand_rgb_to_rgba_row, expand_rgb_u16_to_rgba_u16_row, rgb_to_hsv_row,
+        y216_to_luma_row_endian, y216_to_luma_u16_row_endian, y216_to_rgb_row_endian,
+        y216_to_rgb_u16_row_endian, y216_to_rgba_row_endian, y216_to_rgba_u16_row_endian,
     },
     yuv::{Y216, Y216Row, Y216Sink},
 };
@@ -208,7 +208,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         // Luma u8 — extract 8-bit Y bytes from the Y216 plane via the
         // dedicated kernel (downshifts MSB-aligned 16→8 inline).
         if let Some(buf) = luma.as_deref_mut() {
-            y216_to_luma_row(
+            y216_to_luma_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -219,7 +219,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         // Luma u16 — extract 16-bit Y values at native depth (direct
         // memcpy — no shift needed for full 16-bit samples).
         if let Some(buf) = luma_u16.as_deref_mut() {
-            y216_to_luma_u16_row(
+            y216_to_luma_u16_row_endian(
                 packed,
                 &mut buf[one_plane_start..one_plane_end],
                 w,
@@ -238,7 +238,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
             let rgba_u16_buf = rgba_u16.as_deref_mut().unwrap();
             let rgba_u16_row =
                 rgba_u16_plane_row_slice(rgba_u16_buf, one_plane_start, one_plane_end, w, h)?;
-            y216_to_rgba_u16_row(
+            y216_to_rgba_u16_row_endian(
                 packed,
                 rgba_u16_row,
                 w,
@@ -259,7 +259,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
             })?;
             let rgb_plane_start = one_plane_start * 3;
             let rgb_u16_row = &mut rgb_u16_buf[rgb_plane_start..rgb_plane_end];
-            y216_to_rgb_u16_row(
+            y216_to_rgb_u16_row_endian(
                 packed,
                 rgb_u16_row,
                 w,
@@ -290,7 +290,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
         if want_rgba && !need_u8_rgb_kernel {
             let rgba_buf = rgba.as_deref_mut().unwrap();
             let rgba_row = rgba_plane_row_slice(rgba_buf, one_plane_start, one_plane_end, w, h)?;
-            y216_to_rgba_row(
+            y216_to_rgba_row_endian(
                 packed,
                 rgba_row,
                 w,
@@ -314,7 +314,7 @@ impl PixelSink for MixedSinker<'_, Y216> {
                 w,
                 h,
             )?;
-            y216_to_rgb_row(
+            y216_to_rgb_row_endian(
                 packed,
                 rgb_row,
                 w,