From 560dcdab244a0782c3022bbe499a24fde687d4bb Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Thu, 7 May 2026 23:44:54 +1200 Subject: [PATCH 1/8] feat(be-yuv-hb): BE support for high-bit YUV planar + P-format row kernels (scalar + NEON) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `const BE: bool` generic parameter to all scalar and NEON row kernels for high-bit YUV planar (yuv420p/422p/444p 9–16-bit, yuva 4:2:0/4:2:2/4:4:4) and P-format semi-planar (P010, P012, P016, P410, P412, P416) → RGB/RGBA conversion. BE loads use `load_u16::<BE>` at scalar sites and `load_endian_u16x8::<BE>` / `deinterleave_endian::<BE>` at NEON SIMD sites; x86 and wasm backends wired as `BE = false` pending a follow-up tranche. All 2159 existing tests pass; dispatch + test call sites forward `BE = false` to preserve LE-only behaviour until BE callers land. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../arch/neon/subsampled_high_bit_pn_4_2_0.rs | 156 ++++++++----- .../arch/neon/subsampled_high_bit_pn_4_4_4.rs | 178 ++++++++++----- src/row/arch/neon/tests/high_bit_4_2_0.rs | 72 +++--- .../arch/neon/tests/high_bit_4_4_4_and_pn.rs | 68 +++--- src/row/arch/neon/tests/yuva.rs | 44 ++-- src/row/arch/neon/yuv_planar_16bit.rs | 172 ++++++++------ src/row/arch/neon/yuv_planar_high_bit.rs | 216 ++++++++++++------ src/row/dispatch/pn.rs | 120 +++++----- src/row/dispatch/yuv420/p010.rs | 56 ++--- src/row/dispatch/yuv420/p012.rs | 52 ++--- src/row/dispatch/yuv420/p016.rs | 48 ++-- src/row/dispatch/yuv420/yuv420p10.rs | 48 ++-- src/row/dispatch/yuv420/yuv420p12.rs | 48 ++-- src/row/dispatch/yuv420/yuv420p14.rs | 48 ++-- src/row/dispatch/yuv420/yuv420p16.rs | 48 ++-- src/row/dispatch/yuv420/yuv420p9.rs | 48 ++-- src/row/dispatch/yuv444/mod.rs | 24 +- src/row/dispatch/yuv444/yuv444p10.rs | 26 +-- src/row/dispatch/yuv444/yuv444p12.rs | 24 +- src/row/dispatch/yuv444/yuv444p14.rs | 24 +- src/row/dispatch/yuv444/yuv444p16.rs | 48 ++-- src/row/dispatch/yuv444/yuv444p9.rs | 26 +-- 
src/row/dispatch/yuva/sub_4_2_0.rs | 96 ++++---- src/row/dispatch/yuva/sub_4_4_4.rs | 120 +++++----- src/row/scalar/mod.rs | 11 + src/row/scalar/subsampled_high_bit_pn.rs | 115 +++++----- src/row/scalar/tests.rs | 52 ++--- src/row/scalar/yuv_planar_16bit.rs | 146 ++++++------ src/row/scalar/yuv_planar_high_bit.rs | 113 +++++---- src/sinker/mixed/tests/yuva/sub_4_2_0.rs | 24 +- src/sinker/mixed/tests/yuva/sub_4_2_2.rs | 32 +-- src/sinker/mixed/tests/yuva/sub_4_4_4.rs | 44 ++-- 32 files changed, 1291 insertions(+), 1056 deletions(-) diff --git a/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs index 1224a04b..7381c514 100644 --- a/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs @@ -4,6 +4,27 @@ use crate::{ColorMatrix, row::scalar}; use super::*; +/// Byte-swap every u16 lane in `v` (BE ↔ LE conversion in-register). +/// +/// Equivalent to `vrev16q_u8` on the reinterpreted byte view. Used +/// after `vld2q_u16` to apply per-lane byte-swapping that +/// `load_endian_u16x8` cannot perform for interleaved loads. +#[inline(always)] +unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } +} + +/// Apply BE byte-swap to a `uint16x8x2_t` pair (each lane individually). +/// When `BE = false` this is a no-op and compiles away entirely. +#[inline(always)] +unsafe fn deinterleave_endian(pair: uint16x8x2_t) -> uint16x8x2_t { + if BE { + unsafe { uint16x8x2_t(byteswap_u16x8(pair.0), byteswap_u16x8(pair.1)) } + } else { + pair + } +} + /// NEON high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}: P010, P012) /// → packed **8‑bit** RGB. /// @@ -24,8 +45,8 @@ use super::*; /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] across all -/// supported `BITS` values. +/// Byte‑identical to [`scalar::p_n_to_rgb_row::`] across all +/// supported `BITS` values and endian modes. 
/// /// # Safety /// @@ -38,7 +59,7 @@ use super::*; /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -48,7 +69,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -63,7 +84,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgba_row( +pub(crate) unsafe fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -73,7 +94,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -90,7 +111,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( /// 4. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -132,16 +153,21 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - // 16 Y pixels in two u16x8 loads, right-shifted by 16-BITS to - // extract the active bits from the high-bit packing. 
- let y_vec_lo = vshlq_u16(vld1q_u16(y.as_ptr().add(x)), shr_count); - let y_vec_hi = vshlq_u16(vld1q_u16(y.as_ptr().add(x + 8)), shr_count); + // 16 Y pixels in two u16x8 loads; BE-swapped before the high-bit + // extraction shift (swap_bytes must precede >> (16-BITS) to get + // correct active-bit alignment from BE wire format). + let y_raw_lo = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_raw_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let y_vec_lo = vshlq_u16(y_raw_lo, shr_count); + let y_vec_hi = vshlq_u16(y_raw_hi, shr_count); // Semi‑planar UV: `vld2q_u16` loads 16 interleaved `u16` elements - // and returns (evens, odds) = (U, V) in one shot. - let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = vshlq_u16(uv_pair.0, shr_count); - let v_vec = vshlq_u16(uv_pair.1, shr_count); + // and returns (evens, odds) = (U, V) in one shot. For BE input, + // byte-swap each deinterleaved vector before the shift. + let uv_raw = vld2q_u16(uv_half.as_ptr().add(x)); + let uv_swapped = deinterleave_endian::(uv_raw); + let u_vec = vshlq_u16(uv_swapped.0, shr_count); + let v_vec = vshlq_u16(uv_swapped.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -203,9 +229,9 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -227,8 +253,8 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// /// # Numerical contract /// -/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the -/// monomorphized `BITS`. 
+/// Byte‑identical to [`scalar::p_n_to_rgb_u16_row::`] for the +/// monomorphized `BITS` and endian mode. /// /// # Safety /// @@ -238,7 +264,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -247,7 +273,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -261,7 +287,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// Same as [`p_n_to_rgb_u16_row`], plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -270,7 +296,7 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -288,7 +314,11 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( /// 4. `BITS` ∈ `{10, 12}`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -329,11 +359,14 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(y.as_ptr().add(x) as *const u8); + let y_raw_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let y_vec_lo = vshlq_u16(y_raw_lo, shr_count); + let y_vec_hi = vshlq_u16(y_raw_hi, shr_count); + let uv_raw = vld2q_u16(uv_half.as_ptr().add(x)); + let uv_swapped = deinterleave_endian::(uv_raw); + let u_vec = vshlq_u16(uv_swapped.0, shr_count); + let v_vec = vshlq_u16(uv_swapped.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -393,9 +426,13 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -403,7 +440,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row`]. /// /// # Safety /// @@ -414,7 +451,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -424,7 +461,7 @@ pub(crate) unsafe fn p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -438,7 +475,7 @@ pub(crate) unsafe fn p16_to_rgb_row( /// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p16_to_rgba_row( +pub(crate) unsafe fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -448,7 +485,7 @@ pub(crate) unsafe fn p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -464,7 +501,7 @@ pub(crate) unsafe fn p16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -499,11 +536,13 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vld1q_u16(y.as_ptr().add(x)); - let y_vec_hi = vld1q_u16(y.as_ptr().add(x + 8)); - let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = uv_pair.0; - let v_vec = uv_pair.1; + // P016 has no shift — load, optionally byte-swap, direct use. 
+ let y_vec_lo = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let uv_raw = vld2q_u16(uv_half.as_ptr().add(x)); + let uv_swapped = deinterleave_endian::(uv_raw); + let u_vec = uv_swapped.0; + let v_vec = uv_swapped.1; let u_lo_i32 = vsubq_s32( vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_vec))), @@ -571,9 +610,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -581,7 +620,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// NEON P016 (semi-planar 16-bit) → packed native-depth u16 RGB. /// -/// Byte-identical to [`scalar::p16_to_rgb_u16_row`]. +/// Byte-identical to [`scalar::p16_to_rgb_u16_row::`]. /// /// # Safety /// @@ -590,7 +629,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p16_to_rgb_u16_row( +pub(crate) unsafe fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -599,19 +638,19 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } /// NEON sibling of [`p16_to_rgba_row`] for native-depth `u16` output. -/// Alpha is `0xFFFF` — matches `scalar::p16_to_rgba_u16_row`. +/// Alpha is `0xFFFF` — matches `scalar::p16_to_rgba_u16_row::`. 
/// /// # Safety /// /// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p16_to_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -620,7 +659,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -636,7 +675,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -672,11 +711,12 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vld1q_u16(y.as_ptr().add(x)); - let y_vec_hi = vld1q_u16(y.as_ptr().add(x + 8)); - let uv_pair = vld2q_u16(uv_half.as_ptr().add(x)); - let u_vec = uv_pair.0; - let v_vec = uv_pair.1; + let y_vec_lo = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let uv_raw = vld2q_u16(uv_half.as_ptr().add(x)); + let uv_swapped = deinterleave_endian::(uv_raw); + let u_vec = uv_swapped.0; + let v_vec = uv_swapped.1; let u_lo_i32 = vsubq_s32( vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_vec))), @@ -782,9 +822,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + 
scalar::p16_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } diff --git a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs index bad975d4..ae731333 100644 --- a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs @@ -14,6 +14,27 @@ use super::*; // register, like `nv24_to_rgb_row`. Each iteration consumes 16 Y // pixels and 32 UV `u16` elements (= 16 interleaved U/V pairs). +/// Byte-swap every u16 lane in `v` (BE ↔ LE conversion in-register). +/// +/// Equivalent to `vrev16q_u8` on the reinterpreted byte view. Used +/// after `vld2q_u16` to apply per-lane byte-swapping that +/// `load_endian_u16x8` cannot perform for interleaved loads. +#[inline(always)] +unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } +} + +/// Apply BE byte-swap to a `uint16x8x2_t` pair (each lane individually). +/// When `BE = false` this is a no-op and compiles away entirely. +#[inline(always)] +unsafe fn deinterleave_endian(pair: uint16x8x2_t) -> uint16x8x2_t { + if BE { + unsafe { uint16x8x2_t(byteswap_u16x8(pair.0), byteswap_u16x8(pair.1)) } + } else { + pair + } +} + /// NEON Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB. /// /// Thin wrapper over [`p_n_444_to_rgb_or_rgba_row`] with `ALPHA = false`. @@ -25,7 +46,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgb_row( +pub(crate) unsafe fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -35,7 +56,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -49,7 +70,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( /// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -59,7 +80,7 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -76,7 +97,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -116,19 +141,24 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(y.as_ptr().add(x) as *const u8); + let y_raw_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let y_vec_lo = vshlq_u16(y_raw_lo, shr_count); + let y_vec_hi = vshlq_u16(y_raw_hi, shr_count); // 32 UV elements = 16 interleaved (U, V) pairs. Two `vld2q_u16` // calls deinterleave them into two pairs of (U, V) u16x8 vectors. - let uv_pair_lo = vld2q_u16(uv_full.as_ptr().add(x * 2)); - let uv_pair_hi = vld2q_u16(uv_full.as_ptr().add(x * 2 + 16)); - let u_lo_u16 = vshlq_u16(uv_pair_lo.0, shr_count); - let v_lo_u16 = vshlq_u16(uv_pair_lo.1, shr_count); - let u_hi_u16 = vshlq_u16(uv_pair_hi.0, shr_count); - let v_hi_u16 = vshlq_u16(uv_pair_hi.1, shr_count); + // Byte-swap after deinterleave for BE input. 
+ let uv_raw_lo = vld2q_u16(uv_full.as_ptr().add(x * 2)); + let uv_raw_hi = vld2q_u16(uv_full.as_ptr().add(x * 2 + 16)); + let uv_sw_lo = deinterleave_endian::(uv_raw_lo); + let uv_sw_hi = deinterleave_endian::(uv_raw_hi); + let u_lo_u16 = vshlq_u16(uv_sw_lo.0, shr_count); + let v_lo_u16 = vshlq_u16(uv_sw_lo.1, shr_count); + let u_hi_u16 = vshlq_u16(uv_sw_hi.0, shr_count); + let v_hi_u16 = vshlq_u16(uv_sw_hi.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -199,9 +229,13 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgba_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgb_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -219,7 +253,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -229,7 +263,9 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } @@ -244,7 +280,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -254,7 +290,9 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -271,7 +309,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( /// 3. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -310,15 +352,19 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row(y.as_ptr().add(x) as *const u8); + let y_raw_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let y_vec_lo = vshlq_u16(y_raw_lo, shr_count); + let y_vec_hi = vshlq_u16(y_raw_hi, shr_count); + + let uv_raw_lo = vld2q_u16(uv_full.as_ptr().add(x * 2)); + let uv_raw_hi = vld2q_u16(uv_full.as_ptr().add(x * 2 + 16)); + let uv_sw_lo = deinterleave_endian::(uv_raw_lo); + let uv_sw_hi = deinterleave_endian::(uv_raw_hi); + let u_lo_u16 = vshlq_u16(uv_sw_lo.0, shr_count); + let v_lo_u16 = vshlq_u16(uv_sw_lo.1, shr_count); + let u_hi_u16 = vshlq_u16(uv_sw_hi.0, shr_count); + let v_hi_u16 = vshlq_u16(uv_sw_hi.1, shr_count); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -386,11 +432,11 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + scalar::p_n_444_to_rgba_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } else { - scalar::p_n_444_to_rgb_u16_row::( + scalar::p_n_444_to_rgb_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } @@ -412,7 +458,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row= 3 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgb_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -422,7 +468,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -437,7 +483,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -447,7 +493,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -463,7 +509,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -497,17 +543,20 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vld1q_u16(y.as_ptr().add(x)); - let y_vec_hi = vld1q_u16(y.as_ptr().add(x + 8)); + // P416 has no shift — load + optional byte-swap, direct use. 
+ let y_vec_lo = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); // 16 chroma pairs per iter — two `vld2q_u16` calls deinterleave // 32 UV `u16` elements into two pairs of (U, V) u16x8 vectors. - let uv_pair_lo = vld2q_u16(uv_full.as_ptr().add(x * 2)); - let uv_pair_hi = vld2q_u16(uv_full.as_ptr().add(x * 2 + 16)); - let u_vec_lo = uv_pair_lo.0; - let v_vec_lo = uv_pair_lo.1; - let u_vec_hi = uv_pair_hi.0; - let v_vec_hi = uv_pair_hi.1; + let uv_raw_lo = vld2q_u16(uv_full.as_ptr().add(x * 2)); + let uv_raw_hi = vld2q_u16(uv_full.as_ptr().add(x * 2 + 16)); + let uv_sw_lo = deinterleave_endian::(uv_raw_lo); + let uv_sw_hi = deinterleave_endian::(uv_raw_hi); + let u_vec_lo = uv_sw_lo.0; + let v_vec_lo = uv_sw_lo.1; + let u_vec_hi = uv_sw_hi.0; + let v_vec_hi = uv_sw_hi.1; // Unsigned-widen + bias subtract in i32 (16-bit chroma can't fit // i16 after subtracting 32768). @@ -593,9 +642,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -612,7 +661,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -622,13 +671,15 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } /// NEON sibling of [`p_n_444_16_to_rgba_row`] for native-depth `u16` /// output. Alpha samples are `0xFFFF` (opaque maximum at u16 range) — -/// matches `scalar::p_n_444_16_to_rgba_u16_row`. +/// matches `scalar::p_n_444_16_to_rgba_u16_row::`. /// /// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. /// @@ -637,7 +688,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( /// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -647,7 +698,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -663,7 +716,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -700,10 +753,11 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( while x + 8 <= width { // 8 Y + 8 chroma pairs per iter — tighter block because i64 // chroma narrows throughput; matches `yuv_444p16_to_rgb_u16_row`. 
- let y_vec = vld1q_u16(y.as_ptr().add(x)); - let uv_pair = vld2q_u16(uv_full.as_ptr().add(x * 2)); - let u_vec = uv_pair.0; - let v_vec = uv_pair.1; + let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let uv_raw = vld2q_u16(uv_full.as_ptr().add(x * 2)); + let uv_swapped = deinterleave_endian::(uv_raw); + let u_vec = uv_swapped.0; + let v_vec = uv_swapped.1; let u_lo_i32 = vsubq_s32( vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_vec))), @@ -773,9 +827,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } diff --git a/src/row/arch/neon/tests/high_bit_4_2_0.rs b/src/row/arch/neon/tests/high_bit_4_2_0.rs index 07e360b9..d6368382 100644 --- a/src/row/arch/neon/tests/high_bit_4_2_0.rs +++ b/src/row/arch/neon/tests/high_bit_4_2_0.rs @@ -18,9 +18,9 @@ fn check_p10_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { @@ -43,9 +43,9 @@ fn check_p10_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool let mut rgb_scalar = std::vec![0u16; 
width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { @@ -131,9 +131,9 @@ fn check_p_n_u8_equivalence(width: usize, matrix: ColorMatrix, let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -148,9 +148,9 @@ fn check_p_n_u16_equivalence(width: usize, matrix: ColorMatrix, let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -239,9 +239,9 @@ fn neon_p10_matches_scalar_on_out_of_range_samples() { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut 
rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -250,7 +250,7 @@ fn neon_p10_matches_scalar_on_out_of_range_samples() { let mut rgb16_scalar = std::vec![0u16; width * 3]; let mut rgb16_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>( + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( &y, &u, &v, @@ -260,7 +260,7 @@ fn neon_p10_matches_scalar_on_out_of_range_samples() { full_range, ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -290,9 +290,9 @@ fn check_p010_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -315,9 +315,9 @@ fn check_p010_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: boo let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_neon, width, 
matrix, full_range); } if rgb_scalar != rgb_neon { let diff = rgb_scalar @@ -418,9 +418,9 @@ fn neon_p010_matches_scalar_on_mispacked_input() { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -429,9 +429,9 @@ fn neon_p010_matches_scalar_on_mispacked_input() { let mut rgb16_scalar = std::vec![0u16; width * 3]; let mut rgb16_neon = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb16_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb16_neon, width, matrix, full_range); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -454,9 +454,9 @@ fn check_planar_u8_neon_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_neon, "NEON planar {BITS}-bit → u8 diverges"); } @@ -471,9 +471,9 @@ fn check_planar_u16_neon_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 
3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -492,9 +492,9 @@ fn check_pn_u8_neon_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u8 diverges"); } @@ -510,9 +510,9 @@ fn check_pn_u16_neon_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_neon, "NEON Pn {BITS}-bit → u16 diverges"); } @@ -592,9 +592,9 @@ fn check_planar_u8_neon_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, 
full_range); unsafe { - yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -613,9 +613,9 @@ fn check_pn_u8_neon_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -687,9 +687,9 @@ fn check_yuv420p16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, f let v = p16_plane_neon(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -704,9 +704,9 @@ fn check_p016_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_r let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p16_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, 
diff --git a/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs index 13ceaca8..dc1accc2 100644 --- a/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs @@ -21,7 +21,7 @@ fn check_planar_u16_neon_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( &y, &u, &v, @@ -31,7 +31,7 @@ fn check_planar_u16_neon_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -50,9 +50,9 @@ fn check_pn_u16_neon_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -124,9 +124,9 @@ fn check_yuv420p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_neon(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, 
width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -141,9 +141,9 @@ fn check_p016_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p16_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -204,9 +204,9 @@ fn check_yuv444p_n_u8_neon_equivalence( let v = planar_n_plane::(width, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -224,9 +224,9 @@ fn check_yuv444p_n_u16_neon_equivalence( let v = planar_n_plane::(width, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -318,9 +318,9 @@ fn check_yuv444p16_u8_neon_equivalence(width: usize, matrix: ColorMatrix, full_r let v = p16_plane_neon(width, 71); let mut 
rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -334,9 +334,9 @@ fn check_yuv444p16_u16_neon_equivalence(width: usize, matrix: ColorMatrix, full_ let v = p16_plane_neon(width, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -384,9 +384,9 @@ fn check_p_n_444_u8_neon_equivalence( let uv = interleave_uv(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_444_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -405,9 +405,9 @@ fn check_p_n_444_u16_neon_equivalence( let uv = interleave_uv(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, 
matrix, full_range); unsafe { - p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -422,9 +422,9 @@ fn check_p_n_444_16_u8_neon_equivalence(width: usize, matrix: ColorMatrix, full_ let uv = interleave_uv(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -439,9 +439,9 @@ fn check_p_n_444_16_u16_neon_equivalence(width: usize, matrix: ColorMatrix, full let uv = interleave_uv(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_neon, width, matrix, full_range); + p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_neon, @@ -534,9 +534,9 @@ fn check_yuv444p_n_u8_neon_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, 
full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -555,9 +555,9 @@ fn check_pn_444_u8_neon_rgba_equivalence( let uv = interleave_uv(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -571,9 +571,9 @@ fn check_yuv444p16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, f let v = p16_plane_neon(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -588,9 +588,9 @@ fn check_p_n_444_16_u8_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let uv = interleave_uv(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -688,7 +688,7 @@ fn check_yuv444p16_u8_neon_rgba_with_alpha_src_equivalence( let a_src = p16_plane_neon(width, alpha_seed); let mut 
rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -699,7 +699,7 @@ fn check_yuv444p16_u8_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_with_alpha_src_row( + yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/neon/tests/yuva.rs b/src/row/arch/neon/tests/yuva.rs index c8f6adba..1f871035 100644 --- a/src/row/arch/neon/tests/yuva.rs +++ b/src/row/arch/neon/tests/yuva.rs @@ -20,7 +20,7 @@ fn check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -31,7 +31,7 @@ fn check_yuv444p_n_u8_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_with_alpha_src_row::( + yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -196,7 +196,7 @@ fn check_yuv420p_n_u8_neon_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -207,7 +207,7 @@ fn check_yuv420p_n_u8_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_with_alpha_src_row::( + yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -236,7 +236,7 @@ fn check_yuv420p16_u8_neon_rgba_with_alpha_src_equivalence( let a_src = p16_plane_neon(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, 
@@ -247,7 +247,7 @@ fn check_yuv420p16_u8_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_with_alpha_src_row( + yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -380,7 +380,7 @@ fn check_yuv444p_n_u16_neon_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( &y, &u, &v, @@ -390,7 +390,7 @@ fn check_yuv444p_n_u16_neon_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -409,9 +409,9 @@ fn check_pn_444_u16_neon_rgba_equivalence( let uv = interleave_uv(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -425,9 +425,9 @@ fn check_yuv444p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_neon(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } assert_eq!( 
rgba_scalar, rgba_neon, @@ -442,9 +442,9 @@ fn check_p_n_444_16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let uv = interleave_uv(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_neon, width, matrix, full_range); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_neon, @@ -542,7 +542,7 @@ fn check_yuv444p16_u16_neon_rgba_with_alpha_src_equivalence( let a_src = p16_plane_neon(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -553,7 +553,7 @@ fn check_yuv444p16_u16_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_u16_with_alpha_src_row( + yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -639,7 +639,7 @@ fn check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -650,7 +650,7 @@ fn check_yuv444p_n_u16_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -778,7 +778,7 @@ fn check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - 
scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -789,7 +789,7 @@ fn check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -818,7 +818,7 @@ fn check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence( let a_src = p16_plane_neon(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -829,7 +829,7 @@ fn check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_u16_with_alpha_src_row( + yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/neon/yuv_planar_16bit.rs b/src/row/arch/neon/yuv_planar_16bit.rs index 2a876069..7a5f3774 100644 --- a/src/row/arch/neon/yuv_planar_16bit.rs +++ b/src/row/arch/neon/yuv_planar_16bit.rs @@ -33,7 +33,7 @@ use super::*; /// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgb_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -44,7 +44,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -59,7 +59,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( /// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -70,7 +70,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -90,7 +90,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -102,7 +102,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -135,7 +135,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -178,10 +182,10 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_vec = endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8); // Unsigned-widen U/V to i32, subtract bias (32768 — does not fit i16). let u_lo_i32 = vsubq_s32( @@ -238,12 +242,14 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row> 8` to - // fit u8. 
`vshrq_n_u16` takes a const literal shift; 8 is - // a literal here so the intrinsic is well-formed. + // 16-bit alpha is full-range u16 — byte-swap if BE, then + // `>> 8` to fit u8. `vshrq_n_u16` takes a const literal + // shift; 8 is a literal so the intrinsic is well-formed. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_u16 = vshrq_n_u16::<8>(vld1q_u16(a_ptr.add(x))); - let a_hi_u16 = vshrq_n_u16::<8>(vld1q_u16(a_ptr.add(x + 8))); + let a_lo_raw = endian::load_endian_u16x8::(a_ptr.add(x) as *const u8); + let a_hi_raw = endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8); + let a_lo_u16 = vshrq_n_u16::<8>(a_lo_raw); + let a_hi_u16 = vshrq_n_u16::<8>(a_hi_raw); vcombine_u8(vqmovn_u16(a_lo_u16), vqmovn_u16(a_hi_u16)) } else { alpha_u8 @@ -267,15 +273,17 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_row( + scalar::yuv_420p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -295,7 +303,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -305,7 +313,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -319,7 +327,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -329,7 +337,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -349,7 +357,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -361,7 +369,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -393,7 +401,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -437,10 +449,10 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_vec = endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8); let u_lo_i32 = vsubq_s32( vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_vec))), @@ -526,10 +538,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(a_ptr.add(x) as *const u8), + endian::load_endian_u16x8::(a_ptr.add(x + 8) as 
*const u8), + ) } else { (alpha_u16, alpha_u16) }; @@ -563,15 +578,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_u16_row( + scalar::yuv_420p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_u16_row( + scalar::yuv_420p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -589,7 +604,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -600,7 +615,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -617,7 +632,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( /// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p16_to_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -628,7 +643,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -648,7 +663,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -660,7 +675,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -691,7 +706,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -733,12 +752,12 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row(y.as_ptr().add(x) as *const u8); + let y_vec_hi = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_vec_lo = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let u_vec_hi = endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8); + let v_vec_lo = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); + let v_vec_hi = endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8); // Unsigned-widen + subtract 32768 in i32 (doesn't fit i16). let u_lo_a = vsubq_s32( @@ -810,10 +829,13 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit u8. + // 16-bit alpha is full-range u16 — byte-swap if BE, then + // `>> 8` to fit u8. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_u16 = vshrq_n_u16::<8>(vld1q_u16(a_ptr.add(x))); - let a_hi_u16 = vshrq_n_u16::<8>(vld1q_u16(a_ptr.add(x + 8))); + let a_lo_raw = endian::load_endian_u16x8::(a_ptr.add(x) as *const u8); + let a_hi_raw = endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8); + let a_lo_u16 = vshrq_n_u16::<8>(a_lo_raw); + let a_hi_u16 = vshrq_n_u16::<8>(a_hi_raw); vcombine_u8(vqmovn_u16(a_lo_u16), vqmovn_u16(a_hi_u16)) } else { alpha_u8 @@ -837,15 +859,17 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_row( + scalar::yuv_444p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -862,7 +886,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -873,7 +897,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -890,7 +914,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -901,7 +925,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -921,7 +945,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -933,7 +957,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -965,7 +989,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -1011,9 +1039,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row(y.as_ptr().add(x) as *const u8); + let u_vec = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let v_vec = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); let u_lo_i32 = vsubq_s32( vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(u_vec))), @@ -1069,9 +1097,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8, + ) } else { alpha_u16 }; @@ -1097,15 +1127,15 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_u16_row( + scalar::yuv_444p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_u16_row( + scalar::yuv_444p16_to_rgb_u16_row::( 
tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/neon/yuv_planar_high_bit.rs b/src/row/arch/neon/yuv_planar_high_bit.rs index 102449fe..40dcb920 100644 --- a/src/row/arch/neon/yuv_planar_high_bit.rs +++ b/src/row/arch/neon/yuv_planar_high_bit.rs @@ -41,7 +41,7 @@ use super::*; /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -52,7 +52,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -71,7 +71,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( /// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -82,7 +82,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -102,7 +102,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -114,7 +114,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -153,6 +153,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -208,10 +209,22 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // out‑of‑range samples (e.g. high‑bit‑packed data handed to // the low‑packed kernel) can never push an intermediate past // i16 range. For valid input the AND is a no‑op. - let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); - let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); - let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); - let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + let y_vec_lo = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_vec_hi = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = vandq_u16( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = vandq_u16( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -266,8 +279,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert above. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_u16 = vandq_u16(vld1q_u16(a_ptr.add(x)), mask_v); - let a_hi_u16 = vandq_u16(vld1q_u16(a_ptr.add(x + 8)), mask_v); + let a_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi_u16 = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 
1024 at BITS=10), matching scalar. NEON's // `vshrq_n_u16` requires a literal const generic shift, but @@ -301,15 +320,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_row::( + scalar::yuv_420p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_row::( + scalar::yuv_420p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -346,7 +365,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< /// 4. `BITS` must be one of `{10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -356,7 +375,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -372,7 +391,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( /// `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -382,7 +401,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -403,7 +422,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -415,7 +434,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -454,6 +473,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -509,10 +529,22 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // AND‑mask each load to the low BITS bits so intermediates // stay within the i16 range the Q15 narrow steps expect — see // matching comment in [`yuv_420p_n_to_rgb_row`]. 
- let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); - let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); - let u_vec = vandq_u16(vld1q_u16(u_half.as_ptr().add(x / 2)), mask_v); - let v_vec = vandq_u16(vld1q_u16(v_half.as_ptr().add(x / 2)), mask_v); + let y_vec_lo = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_vec_hi = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = vandq_u16( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = vandq_u16( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -564,8 +596,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // at the same native bit depth (BITS), so just mask off any // over-range bits to match the scalar reference. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = vandq_u16(vld1q_u16(a_ptr.add(x)), mask_v); - let hi = vandq_u16(vld1q_u16(a_ptr.add(x + 8)), mask_v); + let lo = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -594,15 +632,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_u16_row::( + scalar::yuv_420p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -622,7 +660,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// 1. **NEON must be available.** 2. `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -633,7 +671,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -652,7 +690,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( /// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -663,7 +701,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -683,7 +721,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -695,7 +733,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -731,6 +769,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -777,12 +816,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 16 <= width { // 16 Y + 16 U + 16 V per iter, loaded as two u16x8 halves each. 
- let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); - let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); - let u_lo_u16 = vandq_u16(vld1q_u16(u.as_ptr().add(x)), mask_v); - let u_hi_u16 = vandq_u16(vld1q_u16(u.as_ptr().add(x + 8)), mask_v); - let v_lo_u16 = vandq_u16(vld1q_u16(v.as_ptr().add(x)), mask_v); - let v_hi_u16 = vandq_u16(vld1q_u16(v.as_ptr().add(x + 8)), mask_v); + let y_vec_lo = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_vec_hi = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_u16 = vandq_u16( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_u16 = vandq_u16( + endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -840,8 +897,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert above. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_u16 = vandq_u16(vld1q_u16(a_ptr.add(x)), mask_v); - let a_hi_u16 = vandq_u16(vld1q_u16(a_ptr.add(x + 8)), mask_v); + let a_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi_u16 = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 1024 at BITS=10), matching scalar. 
NEON's // `vshrq_n_u16` requires a literal const generic shift, but @@ -875,15 +938,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_row::( + scalar::yuv_444p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_row::( + scalar::yuv_444p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -902,7 +965,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// Same as [`yuv_444p_n_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -913,7 +976,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -932,7 +995,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -943,7 +1006,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -964,7 +1027,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "neon")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -976,7 +1039,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -1014,6 +1077,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -1065,12 +1129,30 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 16 <= width { - let y_vec_lo = vandq_u16(vld1q_u16(y.as_ptr().add(x)), mask_v); - let y_vec_hi = vandq_u16(vld1q_u16(y.as_ptr().add(x + 8)), mask_v); - let u_lo_u16 = vandq_u16(vld1q_u16(u.as_ptr().add(x)), mask_v); - let u_hi_u16 = vandq_u16(vld1q_u16(u.as_ptr().add(x + 8)), mask_v); - let v_lo_u16 = vandq_u16(vld1q_u16(v.as_ptr().add(x)), mask_v); - let v_hi_u16 = vandq_u16(vld1q_u16(v.as_ptr().add(x + 8)), mask_v); + let y_vec_lo = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_vec_hi = vandq_u16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_u16 = vandq_u16( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_u16 = vandq_u16( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_u16 = vandq_u16( + 
endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let y_lo = vreinterpretq_s16_u16(y_vec_lo); let y_hi = vreinterpretq_s16_u16(y_vec_hi); @@ -1123,8 +1205,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< // at the same native bit depth (BITS), so just mask off any // over-range bits to match the scalar reference. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = vandq_u16(vld1q_u16(a_ptr.add(x)), mask_v); - let hi = vandq_u16(vld1q_u16(a_ptr.add(x + 8)), mask_v); + let lo = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = vandq_u16( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -1152,15 +1240,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_u16_row::( + scalar::yuv_444p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index 534bfd64..1f8b7526 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -68,7 +68,7 @@ pub(crate) fn p_n_444_to_rgb_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -77,21 +77,21 @@ pub(crate) fn p_n_444_to_rgb_row( if avx512_available() { // SAFETY: AVX-512BW verified. unsafe { - arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -100,7 +100,7 @@ pub(crate) fn p_n_444_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile-time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -109,7 +109,7 @@ pub(crate) fn p_n_444_to_rgb_row( } } - scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** @@ -139,7 +139,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -147,19 +147,19 @@ pub(crate) fn p_n_444_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -167,7 +167,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -176,7 +176,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( } } - scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. @@ -207,7 +207,7 @@ pub fn p416_to_rgb_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -215,19 +215,19 @@ pub fn p416_to_rgb_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -235,7 +235,7 @@ pub fn p416_to_rgb_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -244,7 +244,7 @@ pub fn p416_to_rgb_row( } } - scalar::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). 
Chroma @@ -272,7 +272,7 @@ pub fn p416_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -280,19 +280,19 @@ pub fn p416_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -300,7 +300,7 @@ pub fn p416_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -309,7 +309,7 @@ pub fn p416_to_rgb_u16_row( } } - scalar::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. @@ -399,7 +399,7 @@ pub fn p410_to_rgba_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -408,21 +408,21 @@ pub fn p410_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -431,7 +431,7 @@ pub fn p410_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -440,7 +440,7 @@ pub fn p410_to_rgba_row( } } - scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } /// P410 → **native-depth `u16`** packed **RGBA** — output is @@ -470,7 +470,7 @@ pub fn p410_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -479,21 +479,21 @@ pub fn p410_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -502,7 +502,7 @@ pub fn p410_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -511,7 +511,7 @@ pub fn p410_to_rgba_u16_row( } } - scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); } /// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** @@ -541,7 +541,7 @@ pub fn p412_to_rgba_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -550,21 +550,21 @@ pub fn p412_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -573,7 +573,7 @@ pub fn p412_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -582,7 +582,7 @@ pub fn p412_to_rgba_row( } } - scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } /// P412 → **native-depth `u16`** packed **RGBA** — output is @@ -612,7 +612,7 @@ pub fn p412_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -621,21 +621,21 @@ pub fn p412_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -644,7 +644,7 @@ pub fn p412_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -653,7 +653,7 @@ pub fn p412_to_rgba_u16_row( } } - scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); } /// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** @@ -684,7 +684,7 @@ pub fn p416_to_rgba_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -693,21 +693,21 @@ pub fn p416_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -716,7 +716,7 @@ pub fn p416_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -725,7 +725,7 @@ pub fn p416_to_rgba_row( } } - scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } /// P416 → **native-depth `u16`** packed **RGBA** — full-range output @@ -757,7 +757,7 @@ pub fn p416_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified.
unsafe { - arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::neon::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -766,21 +766,21 @@ pub fn p416_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -789,7 +789,7 @@ pub fn p416_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } return; } @@ -798,5 +798,5 @@ pub fn p416_to_rgba_u16_row( } } - scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::<false>(y, uv_full, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index ba9d95b8..e9912e75 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -24,7 +24,7 @@ use crate::{ /// /// This is the HDR hardware‑decode keystone format: VideoToolbox, /// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output.
See `scalar::p_n_to_rgb_row::<10>` for the full semantic +/// output. See `scalar::p_n_to_rgb_row::<10, false>` for the full semantic /// specification. `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -49,7 +49,7 @@ pub fn p010_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -58,21 +58,21 @@ pub fn p010_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -81,7 +81,7 @@ pub fn p010_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -90,7 +90,7 @@ pub fn p010_to_rgb_row( } } - scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P010** to **native‑depth `u16`** packed RGB @@ -99,7 +99,7 @@ pub fn p010_to_rgb_row( /// Callers feeding this output into a P010 consumer must shift left /// by 6. /// -/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. +/// See `scalar::p_n_to_rgb_u16_row::<10, false>` for the full spec. /// `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -124,7 +124,7 @@ pub fn p010_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -133,21 +133,21 @@ pub fn p010_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -156,7 +156,7 @@ pub fn p010_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( + arch::wasm_simd128::p_n_to_rgb_u16_row::<10, false>( y, uv_half, rgb_out, width, matrix, full_range, ); } @@ -167,14 +167,14 @@ pub fn p010_to_rgb_u16_row( } } - scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// See `scalar::p_n_to_rgba_row::<10, false>` for the reference. /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -199,7 +199,7 @@ pub fn p010_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -208,21 +208,21 @@ pub fn p010_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -231,7 +231,7 @@ pub fn p010_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -240,14 +240,14 @@ pub fn p010_to_rgba_row( } } - scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 10) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// See `scalar::p_n_to_rgba_u16_row::<10, false>` for the reference. /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -272,7 +272,7 @@ pub fn p010_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -281,21 +281,21 @@ pub fn p010_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. 
unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -304,7 +304,7 @@ pub fn p010_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -313,5 +313,5 @@ pub fn p010_to_rgba_u16_row( } } - scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index ef1c1301..a91ac9b0 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -47,7 +47,7 @@ pub fn p012_to_rgb_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -55,19 +55,19 @@ pub fn p012_to_rgb_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + 
arch::x86_avx512::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -75,7 +75,7 @@ pub fn p012_to_rgb_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -84,7 +84,7 @@ pub fn p012_to_rgb_row( } } - scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P012** to **native‑depth `u16`** packed RGB @@ -112,7 +112,7 @@ pub fn p012_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -120,19 +120,19 @@ pub fn p012_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } 
return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -140,7 +140,7 @@ pub fn p012_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + arch::wasm_simd128::p_n_to_rgb_u16_row::<12, false>( y, uv_half, rgb_out, width, matrix, full_range, ); } @@ -151,14 +151,14 @@ pub fn p012_to_rgb_u16_row( } } - scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). /// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// See `scalar::p_n_to_rgba_row::<12, false>` for the reference. /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -183,7 +183,7 @@ pub fn p012_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -192,21 +192,21 @@ pub fn p012_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -215,7 +215,7 @@ pub fn p012_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -224,14 +224,14 @@ pub fn p012_to_rgba_row( } } - scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, /// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output /// is low-bit-packed; alpha element is `(1 << 12) - 1`. /// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// See `scalar::p_n_to_rgba_u16_row::<12, false>` for the reference. /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -256,7 +256,7 @@ pub fn p012_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -265,21 +265,21 @@ pub fn p012_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. 
unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -288,7 +288,7 @@ pub fn p012_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -297,5 +297,5 @@ pub fn p012_to_rgba_u16_row( } } - scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index abdf59d1..d21bc556 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -42,7 +42,7 @@ pub fn p016_to_rgb_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -50,19 +50,19 @@ pub fn p016_to_rgb_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + 
arch::x86_avx512::p16_to_rgb_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p16_to_rgb_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::p16_to_rgb_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -70,7 +70,7 @@ target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p16_to_rgb_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -79,7 +79,7 @@ } } - scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p16_to_rgb_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P016** to **native-depth `u16`** packed RGB @@ -106,7 +106,7 @@ pub fn p016_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::neon::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -114,19 +114,19 @@ pub fn p016_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +
arch::x86_sse41::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -134,7 +134,7 @@ target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } return; } @@ -143,7 +143,7 @@ } } - scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + scalar::p16_to_rgb_u16_row::<false>(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit @@ -174,7 +174,7 @@ pub fn p016_to_rgba_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -182,19 +182,19 @@ target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -202,7 +202,7 @@ target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -211,7 +211,7 @@
pub fn p016_to_rgba_row( } } - scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p16_to_rgba_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } /// Converts one row of **P016** to **native-depth `u16`** packed @@ -243,7 +243,7 @@ pub fn p016_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::neon::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -251,19 +251,19 @@ pub fn p016_to_rgba_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -271,7 +271,7 @@ pub fn p016_to_rgba_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } return; } @@ -280,5 +280,5 @@ pub fn p016_to_rgba_u16_row( } } - scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::<false>(y, uv_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 349d0623..103c603e 100--- a/src/row/dispatch/yuv420/yuv420p10.rs +++
b/src/row/dispatch/yuv420/yuv420p10.rs @@ -53,7 +53,7 @@ pub fn yuv420p10_to_rgb_row( // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -62,7 +62,7 @@ pub fn yuv420p10_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -71,7 +71,7 @@ pub fn yuv420p10_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -80,7 +80,7 @@ pub fn yuv420p10_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -91,7 +91,7 @@ pub fn yuv420p10_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -102,7 +102,7 @@ pub fn yuv420p10_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed @@ -147,7 +147,7 @@ pub fn yuv420p10_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( + arch::neon::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -158,7 +158,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -167,7 +167,7 @@ pub fn yuv420p10_to_rgb_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -176,7 +176,7 @@ pub fn yuv420p10_to_rgb_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -187,7 +187,7 @@ pub fn yuv420p10_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -198,7 +198,7 @@ pub fn yuv420p10_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** @@ -235,7 +235,7 @@ pub fn yuv420p10_to_rgba_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -244,7 +244,7 @@ pub fn yuv420p10_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -253,7 +253,7 @@ pub fn yuv420p10_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -262,7 +262,7 @@ pub fn yuv420p10_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -273,7 +273,7 @@ pub fn yuv420p10_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -284,7 +284,7 @@ pub fn yuv420p10_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } /// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** @@ -319,7 +319,7 @@ pub fn yuv420p10_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_u16_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -328,7 +328,7 @@ pub fn yuv420p10_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -337,7 +337,7 @@ pub fn yuv420p10_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -346,7 +346,7 @@ pub fn yuv420p10_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -357,7 +357,7 @@ pub fn yuv420p10_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -368,5 +368,5 @@ pub fn yuv420p10_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 3b503b74..2e0dec5c 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -48,7 +48,7 @@ pub fn yuv420p12_to_rgb_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -57,7 +57,7 @@ pub fn yuv420p12_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -66,7 +66,7 @@ pub fn yuv420p12_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -75,7 +75,7 @@ pub fn yuv420p12_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -86,7 +86,7 @@ pub fn yuv420p12_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -97,7 +97,7 @@ pub fn yuv420p12_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed @@ -127,7 +127,7 @@ pub fn yuv420p12_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + arch::neon::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -137,7 +137,7 @@ pub fn yuv420p12_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -145,7 +145,7 @@ pub fn yuv420p12_to_rgb_u16_row( } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -153,7 +153,7 @@ pub fn yuv420p12_to_rgb_u16_row( } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -163,7 +163,7 @@ pub fn yuv420p12_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -174,7 +174,7 @@ pub fn yuv420p12_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, 
matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** @@ -211,7 +211,7 @@ pub fn yuv420p12_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -220,7 +220,7 @@ pub fn yuv420p12_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -229,7 +229,7 @@ pub fn yuv420p12_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -238,7 +238,7 @@ pub fn yuv420p12_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -249,7 +249,7 @@ pub fn yuv420p12_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -260,7 +260,7 @@ pub fn yuv420p12_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } /// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** @@ -295,7 +295,7 @@ pub fn yuv420p12_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_u16_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -304,7 +304,7 @@ pub fn yuv420p12_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -313,7 +313,7 @@ pub fn yuv420p12_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -322,7 +322,7 @@ pub fn yuv420p12_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -333,7 +333,7 @@ pub fn yuv420p12_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -344,5 +344,5 @@ pub fn yuv420p12_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index 50427e59..61f6e59a 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -42,7 +42,7 @@ pub fn yuv420p14_to_rgb_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -50,7 +50,7 @@ pub fn yuv420p14_to_rgb_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -58,7 +58,7 @@ pub fn yuv420p14_to_rgb_row( } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -66,7 +66,7 @@ pub fn yuv420p14_to_rgb_row( } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -76,7 +76,7 @@ pub fn yuv420p14_to_rgb_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, 
matrix, full_range, ); } @@ -87,7 +87,7 @@ pub fn yuv420p14_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed @@ -116,7 +116,7 @@ pub fn yuv420p14_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + arch::neon::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -126,7 +126,7 @@ pub fn yuv420p14_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -134,7 +134,7 @@ pub fn yuv420p14_to_rgb_u16_row( } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -142,7 +142,7 @@ pub fn yuv420p14_to_rgb_u16_row( } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -152,7 +152,7 @@ pub fn yuv420p14_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -163,7 +163,7 @@ pub fn yuv420p14_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of 
**14-bit** YUV 4:2:0 to packed **8-bit** @@ -200,7 +200,7 @@ pub fn yuv420p14_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -209,7 +209,7 @@ pub fn yuv420p14_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -218,7 +218,7 @@ pub fn yuv420p14_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -227,7 +227,7 @@ pub fn yuv420p14_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -238,7 +238,7 @@ pub fn yuv420p14_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -249,7 +249,7 @@ pub fn yuv420p14_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } /// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** @@ -284,7 +284,7 @@ pub fn yuv420p14_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_u16_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -293,7 +293,7 @@ pub fn yuv420p14_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -302,7 +302,7 @@ pub fn yuv420p14_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -311,7 +311,7 @@ pub fn yuv420p14_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -322,7 +322,7 @@ pub fn yuv420p14_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -333,5 +333,5 @@ pub fn yuv420p14_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index c681c48b..75690ee5 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -46,7 +46,7 @@ pub fn yuv420p16_to_rgb_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -54,19 +54,19 @@ pub fn yuv420p16_to_rgb_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -74,7 +74,7 @@ pub fn yuv420p16_to_rgb_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + 
arch::wasm_simd128::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -83,7 +83,7 @@ pub fn yuv420p16_to_rgb_row( } } - scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** @@ -112,7 +112,7 @@ pub fn yuv420p16_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -120,19 +120,19 @@ pub fn yuv420p16_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -140,7 +140,7 @@ pub fn yuv420p16_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -149,7 +149,7 @@ pub fn yuv420p16_to_rgb_u16_row( } } - scalar::yuv_420p16_to_rgb_u16_row(y, u_half, 
v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_u16_row::<false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** @@ -183,7 +183,7 @@ pub fn yuv420p16_to_rgba_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -191,19 +191,19 @@ pub fn yuv420p16_to_rgba_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -211,7 +211,7 @@ pub fn yuv420p16_to_rgba_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -220,7 +220,7 @@ pub fn yuv420p16_to_rgba_row( } } - scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** @@ -255,7 +255,7 @@ pub fn 
yuv420p16_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -263,19 +263,19 @@ pub fn yuv420p16_to_rgba_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -283,7 +283,7 @@ pub fn yuv420p16_to_rgba_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -292,5 +292,5 @@ pub fn yuv420p16_to_rgba_u16_row( } } - scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::<false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 09cb0156..d4867753 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -55,7 +55,7 @@ pub fn yuv420p9_to_rgb_row( if neon_available() { 
// SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgb_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } return; } @@ -64,7 +64,7 @@ pub fn yuv420p9_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -73,7 +73,7 @@ pub fn yuv420p9_to_rgb_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -82,7 +82,7 @@ pub fn yuv420p9_to_rgb_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -93,7 +93,7 @@ pub fn yuv420p9_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -104,7 +104,7 @@ pub fn yuv420p9_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } /// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed @@ -134,7 +134,7 @@ pub fn yuv420p9_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( + arch::neon::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -145,7 +145,7 @@ pub fn yuv420p9_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -154,7 +154,7 @@ pub fn yuv420p9_to_rgb_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -163,7 +163,7 @@ pub fn yuv420p9_to_rgb_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -174,7 +174,7 @@ pub fn yuv420p9_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, ); } @@ -185,7 +185,7 @@ pub fn yuv420p9_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); } // ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- @@ -229,7 +229,7 @@ pub fn yuv420p9_to_rgba_row( // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). 
unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -238,7 +238,7 @@ pub fn yuv420p9_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -247,7 +247,7 @@ pub fn yuv420p9_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -256,7 +256,7 @@ pub fn yuv420p9_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -267,7 +267,7 @@ pub fn yuv420p9_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -278,7 +278,7 @@ pub fn yuv420p9_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } /// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** @@ -313,7 +313,7 @@ pub fn yuv420p9_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + arch::neon::yuv_420p_n_to_rgba_u16_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } return; } @@ -322,7 +322,7 @@ pub fn yuv420p9_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -331,7 +331,7 @@ pub fn yuv420p9_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -340,7 +340,7 @@ pub fn yuv420p9_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -351,7 +351,7 @@ pub fn yuv420p9_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, ); } @@ -362,5 +362,5 @@ pub fn yuv420p9_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 01ca3861..00b78660 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -67,7 +67,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -76,21 +76,21 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -99,7 +99,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -108,7 +108,7 @@ } } - scalar::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } /// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. @@ -144,7 +144,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -153,21 +153,21 @@ if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -176,7 +176,7 @@ if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -185,7 +185,7 @@ } } - scalar::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_u16_row::<BITS, BE>(y, u, v, rgb_out, width, matrix, full_range); } pub(super) mod yuv444p10; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index b6836e8e..245c765a 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -20,7 +20,7 @@ use crate::{ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; /// YUV 4:4:4 planar 10-bit → u8 RGB. 
Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. +/// crate-internal `yuv_444p_n_to_rgb_row::<10, false>`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv444p10_to_rgb_row( @@ -80,7 +80,7 @@ pub fn yuv444p10_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -89,21 +89,21 @@ pub fn yuv444p10_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -112,7 +112,7 @@ pub fn yuv444p10_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -121,7 +121,7 @@ pub fn yuv444p10_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } /// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** @@ -153,7 +153,7 @@ pub fn yuv444p10_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -162,21 +162,21 @@ pub fn yuv444p10_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -185,7 +185,7 @@ pub fn yuv444p10_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -194,5 +194,5 @@ pub fn yuv444p10_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index c4f3e0f4..2eec3e85 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -79,7 +79,7 @@ pub fn yuv444p12_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -88,21 +88,21 @@ pub fn yuv444p12_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -111,7 +111,7 @@ pub fn yuv444p12_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -120,7 +120,7 @@ pub fn yuv444p12_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } /// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** @@ -152,7 +152,7 @@ pub fn yuv444p12_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -161,21 +161,21 @@ pub fn yuv444p12_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -184,7 +184,7 @@ pub fn yuv444p12_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -193,5 +193,5 @@ pub fn yuv444p12_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 8b7b7e7b..0d6f7104 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -79,7 +79,7 @@ pub fn yuv444p14_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -88,21 +88,21 @@ pub fn yuv444p14_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -111,7 +111,7 @@ pub fn yuv444p14_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -120,7 +120,7 @@ pub fn yuv444p14_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } /// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** @@ -152,7 +152,7 @@ pub fn yuv444p14_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -161,21 +161,21 @@ pub fn yuv444p14_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -184,7 +184,7 @@ pub fn yuv444p14_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -193,5 +193,5 @@ pub fn yuv444p14_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index 87d69fc9..c32ad5ab 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -46,7 +46,7 @@ pub fn yuv444p16_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -55,21 +55,21 @@ pub fn yuv444p16_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -78,7 +78,7 @@ pub fn yuv444p16_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p16_to_rgb_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -87,7 +87,7 @@ } } - scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } /// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range @@ -117,7 +117,7 @@ pub fn yuv444p16_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -126,21 +126,21 @@ pub fn yuv444p16_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -149,7 +149,7 @@ pub fn yuv444p16_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -158,7 +158,7 @@ } } - scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::<false>(y, u, v, rgb_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** @@ -190,7 +190,7 @@ pub fn yuv444p16_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -199,21 +199,21 @@ pub fn yuv444p16_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -222,7 +222,7 @@ pub fn yuv444p16_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -231,7 +231,7 @@ } } - scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** @@ -264,7 +264,7 @@ pub fn yuv444p16_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -273,21 +273,21 @@ pub fn yuv444p16_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -296,7 +296,7 @@ pub fn yuv444p16_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. 
unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -305,5 +305,5 @@ } } - scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::<false>(y, u, v, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index 784ed036..04bb0a5e 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -24,7 +24,7 @@ use crate::{ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; /// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<9>`. +/// crate-internal `yuv_444p_n_to_rgb_row::<9, false>`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv444p9_to_rgb_row( @@ -95,7 +95,7 @@ pub fn yuv444p9_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -104,21 +104,21 @@ pub fn yuv444p9_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -127,7 +127,7 @@ pub fn yuv444p9_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -136,7 +136,7 @@ pub fn yuv444p9_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } /// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** @@ -170,7 +170,7 @@ pub fn yuv444p9_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -179,21 +179,21 @@ pub fn yuv444p9_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -202,7 +202,7 @@ pub fn yuv444p9_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } return; } @@ -211,5 +211,5 @@ pub fn yuv444p9_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); } diff --git a/src/row/dispatch/yuva/sub_4_2_0.rs b/src/row/dispatch/yuva/sub_4_2_0.rs index c2628a99..b1e0840b 100644 --- a/src/row/dispatch/yuva/sub_4_2_0.rs +++ b/src/row/dispatch/yuva/sub_4_2_0.rs @@ -151,7 +151,7 @@ pub fn yuva420p9_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -162,7 +162,7 @@ pub fn yuva420p9_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -171,7 +171,7 @@ pub fn yuva420p9_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -180,7 +180,7 @@ pub fn yuva420p9_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -191,7 +191,7 @@ pub fn yuva420p9_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -202,7 +202,7 @@ pub fn yuva420p9_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -242,7 +242,7 @@ pub fn yuva420p9_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -253,7 +253,7 @@ pub fn yuva420p9_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -262,7 +262,7 @@ pub fn yuva420p9_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -271,7 +271,7 @@ pub fn yuva420p9_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -282,7 +282,7 @@ pub fn yuva420p9_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -293,7 +293,7 @@ pub fn yuva420p9_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -333,7 +333,7 @@ pub fn yuva420p10_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -344,7 +344,7 @@ pub fn yuva420p10_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -353,7 +353,7 @@ pub fn yuva420p10_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -362,7 +362,7 @@ pub fn yuva420p10_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -373,7 +373,7 @@ pub fn yuva420p10_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -384,7 +384,7 @@ pub fn yuva420p10_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -422,7 +422,7 @@ pub fn yuva420p10_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -433,7 +433,7 @@ pub fn yuva420p10_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -442,7 +442,7 @@ pub fn yuva420p10_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -451,7 +451,7 @@ pub fn yuva420p10_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -462,7 +462,7 @@ pub fn yuva420p10_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -473,7 +473,7 @@ pub fn yuva420p10_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -513,7 +513,7 @@ pub fn yuva420p12_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -524,7 +524,7 @@ pub fn yuva420p12_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -533,7 +533,7 @@ pub fn yuva420p12_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -542,7 +542,7 @@ pub fn yuva420p12_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -553,7 +553,7 @@ pub fn yuva420p12_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -564,7 +564,7 @@ pub fn yuva420p12_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -602,7 +602,7 @@ pub fn yuva420p12_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -613,7 +613,7 @@ pub fn yuva420p12_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -622,7 +622,7 @@ pub fn yuva420p12_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -631,7 +631,7 @@ pub fn yuva420p12_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -642,7 +642,7 @@ pub fn yuva420p12_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -653,7 +653,7 @@ pub fn yuva420p12_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -692,7 +692,7 @@ pub fn yuva420p16_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p16_to_rgba_with_alpha_src_row( + arch::neon::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -703,7 +703,7 @@ pub fn yuva420p16_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row( + arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -712,7 +712,7 @@ pub fn yuva420p16_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row( + arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -721,7 +721,7 @@ pub fn yuva420p16_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row( + arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -732,7 +732,7 @@ pub fn yuva420p16_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row( + arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -743,7 +743,7 @@ pub fn yuva420p16_to_rgba_row( } } - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -781,7 +781,7 @@ pub fn yuva420p16_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( + arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -792,7 +792,7 @@ pub fn yuva420p16_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( + arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -801,7 +801,7 @@ pub fn yuva420p16_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( + arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -810,7 +810,7 @@ pub fn yuva420p16_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( + arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -821,7 +821,7 @@ pub fn yuva420p16_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( + arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } @@ -832,7 +832,7 @@ pub fn yuva420p16_to_rgba_u16_row( } } - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); } diff --git a/src/row/dispatch/yuva/sub_4_4_4.rs b/src/row/dispatch/yuva/sub_4_4_4.rs index 6ec7a2bb..4eb17490 100644 --- a/src/row/dispatch/yuva/sub_4_4_4.rs +++ b/src/row/dispatch/yuva/sub_4_4_4.rs @@ -145,7 +145,7 @@ pub fn yuva444p9_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -156,7 +156,7 @@ pub fn yuva444p9_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -165,7 +165,7 @@ pub fn yuva444p9_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -174,7 +174,7 @@ pub fn yuva444p9_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -185,7 +185,7 @@ pub fn yuva444p9_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -196,7 +196,7 @@ pub fn yuva444p9_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -235,7 +235,7 @@ pub fn yuva444p9_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -246,7 +246,7 @@ pub fn yuva444p9_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -255,7 +255,7 @@ pub fn yuva444p9_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -264,7 +264,7 @@ pub fn yuva444p9_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -275,7 +275,7 @@ pub fn yuva444p9_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -286,7 +286,7 @@ pub fn yuva444p9_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -325,7 +325,7 @@ pub fn yuva444p10_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -336,7 +336,7 @@ pub fn yuva444p10_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -345,7 +345,7 @@ pub fn yuva444p10_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -354,7 +354,7 @@ pub fn yuva444p10_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -365,7 +365,7 @@ pub fn yuva444p10_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -376,7 +376,7 @@ pub fn yuva444p10_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -415,7 +415,7 @@ pub fn yuva444p10_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -426,7 +426,7 @@ pub fn yuva444p10_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -435,7 +435,7 @@ pub fn yuva444p10_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -444,7 +444,7 @@ pub fn yuva444p10_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -455,7 +455,7 @@ pub fn yuva444p10_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -466,7 +466,7 @@ pub fn yuva444p10_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -505,7 +505,7 @@ pub fn yuva444p12_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -516,7 +516,7 @@ pub fn yuva444p12_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -525,7 +525,7 @@ pub fn yuva444p12_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -534,7 +534,7 @@ pub fn yuva444p12_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -545,7 +545,7 @@ pub fn yuva444p12_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -556,7 +556,7 @@ pub fn yuva444p12_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -595,7 +595,7 @@ pub fn yuva444p12_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -606,7 +606,7 @@ pub fn yuva444p12_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -615,7 +615,7 @@ pub fn yuva444p12_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -624,7 +624,7 @@ pub fn yuva444p12_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -635,7 +635,7 @@ pub fn yuva444p12_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -646,7 +646,7 @@ pub fn yuva444p12_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -684,7 +684,7 @@ pub fn yuva444p14_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -695,7 +695,7 @@ pub fn yuva444p14_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -704,7 +704,7 @@ pub fn yuva444p14_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -713,7 +713,7 @@ pub fn yuva444p14_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -724,7 +724,7 @@ pub fn yuva444p14_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -735,7 +735,7 @@ pub fn yuva444p14_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -774,7 +774,7 @@ pub fn yuva444p14_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -785,7 +785,7 @@ pub fn yuva444p14_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -794,7 +794,7 @@ pub fn yuva444p14_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -803,7 +803,7 @@ pub fn yuva444p14_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -814,7 +814,7 @@ pub fn yuva444p14_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -825,7 +825,7 @@ pub fn yuva444p14_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -873,7 +873,7 @@ pub fn yuva444p16_to_rgba_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgba_with_alpha_src_row( + arch::neon::yuv_444p16_to_rgba_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -884,7 +884,7 @@ pub fn yuva444p16_to_rgba_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_with_alpha_src_row( + arch::x86_avx512::yuv_444p16_to_rgba_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -893,7 +893,7 @@ pub fn yuva444p16_to_rgba_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_with_alpha_src_row( + arch::x86_avx2::yuv_444p16_to_rgba_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -902,7 +902,7 @@ pub fn yuva444p16_to_rgba_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_with_alpha_src_row( + arch::x86_sse41::yuv_444p16_to_rgba_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -913,7 +913,7 @@ pub fn yuva444p16_to_rgba_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_with_alpha_src_row( + arch::wasm_simd128::yuv_444p16_to_rgba_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -924,7 +924,7 @@ pub fn yuva444p16_to_rgba_row( } } - scalar::yuv_444p16_to_rgba_with_alpha_src_row(y, u, v, a, rgba_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } /// Converts one row of **16-bit** YUVA 4:4:4 to **native-depth `u16`** @@ -960,7 +960,7 @@ pub fn yuva444p16_to_rgba_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p16_to_rgba_u16_with_alpha_src_row( + arch::neon::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -971,7 +971,7 @@ pub fn yuva444p16_to_rgba_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_with_alpha_src_row( + arch::x86_avx512::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -980,7 +980,7 @@ pub fn yuva444p16_to_rgba_u16_row( if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_with_alpha_src_row( + arch::x86_avx2::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -989,7 +989,7 @@ pub fn yuva444p16_to_rgba_u16_row( if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_with_alpha_src_row( + arch::x86_sse41::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -1000,7 +1000,7 @@ pub fn yuva444p16_to_rgba_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_with_alpha_src_row( + arch::wasm_simd128::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } @@ -1011,7 +1011,7 @@ pub fn yuva444p16_to_rgba_u16_row( } } - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y, u, v, a, rgba_out, width, matrix, full_range, ); } diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs index 5c8cb66c..4212e48b 100644 --- a/src/row/scalar/mod.rs +++ b/src/row/scalar/mod.rs @@ -181,6 +181,17 @@ pub(super) fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } +/// Byte-swap a `u16` sample when `BE = true`; identity when `BE = false`. +/// +/// Used by BE-aware scalar kernels to normalize big-endian `u16` plane +/// elements to host-native order at load time. The `if BE` branch is +/// dead-code-eliminated by the compiler for each monomorphization, so +/// the LE path (`BE = false`) is a zero-overhead no-op. +#[cfg_attr(not(tarpaulin), inline(always))] +pub(super) const fn load_u16(v: u16) -> u16 { + if BE { v.swap_bytes() } else { v } +} + /// `(sample * scale_q15 + RND) >> 15`. With input masked to BITS, /// the `sample * scale` product cannot overflow i32 for any /// reasonable `OUT_BITS ≤ 16`, so plain arithmetic is sufficient. 
diff --git a/src/row/scalar/subsampled_high_bit_pn.rs b/src/row/scalar/subsampled_high_bit_pn.rs index 1b061190..d93cc2d3 100644 --- a/src/row/scalar/subsampled_high_bit_pn.rs +++ b/src/row/scalar/subsampled_high_bit_pn.rs @@ -1,4 +1,4 @@ -use super::*; +use super::{load_u16, *}; // ---- P010 (semi-planar 10-bit, high-bit-packed) → RGB ------------------ @@ -23,7 +23,7 @@ use super::*; /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgb_row( +pub(crate) fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -31,7 +31,7 @@ pub(crate) fn p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012) @@ -47,7 +47,7 @@ pub(crate) fn p_n_to_rgb_row( // caller. P016 has its own kernel family // ([`p16_to_rgb_or_rgba_row`]) — never routed here. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgba_row( +pub(crate) fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -55,7 +55,7 @@ pub(crate) fn p_n_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } /// Shared kernel for [`p_n_to_rgb_row`] (`ALPHA = false`, 3 bpp store) @@ -68,7 +68,7 @@ pub(crate) fn p_n_to_rgba_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgb_or_rgba_row( +pub(crate) fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -97,17 +97,18 @@ pub(crate) fn p_n_to_rgb_or_rgba_row( let shift = 16 - BITS; // Each `u16` load is converted to its `BITS`-bit sample with - // `>> (16 - BITS)` — 6 for P010, 4 for P012. Extracts the upper - // bits and leaves the result in `[0, (1 << BITS) - 1]`. If - // low-packed input (`yuv420p10le`, `yuv420p12le`) is handed to - // this kernel by mistake, the shift discards the active low bits - // rather than recovering the intended value. No hot-path cost: - // one shift per load. + // `>> (16 - BITS)` — 6 for P010, 4 for P012. The BE byte-swap is + // applied first (on the raw wire format), then the shift extracts the + // active upper bits: `load_u16::(sample) >> (16 - BITS)`. If + // low-packed input (`yuv420p10le`, `yuv420p12le`) is handed to this + // kernel by mistake, the shift discards the active low bits rather than + // recovering the intended value. No hot-path cost: one swap + one shift + // per load. 
let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> shift; - let v_sample = uv_half[c_idx * 2 + 1] >> shift; + let u_sample = load_u16::(uv_half[c_idx * 2]) >> shift; + let v_sample = load_u16::(uv_half[c_idx * 2 + 1]) >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -115,7 +116,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) >> shift) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -123,7 +124,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_row( out[x * bpp + 3] = 0xFF; } - let y1 = q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); + let y1 = q15_scale((load_u16::(y[x + 1]) >> shift) as i32 - y_off, y_scale); out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); @@ -156,7 +157,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgb_u16_row( +pub(crate) fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -164,7 +165,7 @@ pub(crate) fn p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of high‑bit‑packed semi‑planar 4:2:0 (P010/P012) @@ -180,7 +181,7 @@ pub(crate) fn p_n_to_rgb_u16_row( // no caller. 
P016 has its own u16 kernel family // ([`p16_to_rgb_or_rgba_u16_row`]) — never routed here. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgba_u16_row( +pub(crate) fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -188,7 +189,7 @@ pub(crate) fn p_n_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } /// Shared kernel for [`p_n_to_rgb_u16_row`] (`ALPHA = false`, 3 bpp @@ -201,7 +202,7 @@ pub(crate) fn p_n_to_rgba_u16_row( /// - `y.len() >= width`, `uv_half.len() >= width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -231,8 +232,8 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2] >> shift; - let v_sample = uv_half[c_idx * 2 + 1] >> shift; + let u_sample = load_u16::(uv_half[c_idx * 2]) >> shift; + let v_sample = load_u16::(uv_half[c_idx * 2 + 1]) >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -240,7 +241,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) >> shift) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -248,7 +249,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( out[x * bpp + 3] = alpha_max; } - let y1 = 
q15_scale((y[x + 1] >> shift) as i32 - y_off, y_scale); + let y1 = q15_scale((load_u16::(y[x + 1]) >> shift) as i32 - y_off, y_scale); out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; @@ -281,7 +282,7 @@ pub(crate) fn p_n_to_rgb_or_rgba_u16_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgb_row( +pub(crate) fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -289,7 +290,7 @@ pub(crate) fn p_n_444_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Converts one row of high-bit-packed semi-planar 4:4:4 (P410, P412) @@ -304,7 +305,7 @@ pub(crate) fn p_n_444_to_rgb_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgba_row( +pub(crate) fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -312,7 +313,7 @@ pub(crate) fn p_n_444_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } /// Shared kernel for [`p_n_444_to_rgb_row`] (`ALPHA = false`, 3 bpp @@ -324,7 +325,7 @@ pub(crate) fn p_n_444_to_rgba_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgb_or_rgba_row( +pub(crate) fn p_n_444_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -345,8 +346,8 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_row( for x in 0..width { // 4:4:4: one UV pair per pixel — uv_full[x*2] = U, uv_full[x*2+1] = V. - let u_sample = uv_full[x * 2] >> shift; - let v_sample = uv_full[x * 2 + 1] >> shift; + let u_sample = load_u16::(uv_full[x * 2]) >> shift; + let v_sample = load_u16::(uv_full[x * 2 + 1]) >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -354,7 +355,7 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) >> shift) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -377,7 +378,7 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgb_u16_row( +pub(crate) fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -385,7 +386,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Converts one row of high-bit-packed semi-planar 4:4:4 (P410, P412) @@ -400,7 +401,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgba_out.len() >= 4 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgba_u16_row( +pub(crate) fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -408,7 +409,7 @@ pub(crate) fn p_n_444_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } /// Shared kernel for [`p_n_444_to_rgb_u16_row`] (`ALPHA = false`, @@ -420,7 +421,7 @@ pub(crate) fn p_n_444_to_rgba_u16_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }` (`u16` elements). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -442,8 +443,8 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row let alpha_max: u16 = out_max as u16; for x in 0..width { - let u_sample = uv_full[x * 2] >> shift; - let v_sample = uv_full[x * 2 + 1] >> shift; + let u_sample = load_u16::(uv_full[x * 2]) >> shift; + let v_sample = load_u16::(uv_full[x * 2 + 1]) >> shift; let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -451,7 +452,7 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] >> shift) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) >> shift) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -472,7 +473,7 @@ pub(crate) fn p_n_444_to_rgb_or_rgba_u16_row /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgb_row( +pub(crate) fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -480,7 +481,7 @@ pub(crate) fn p_n_444_16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Converts one row of P416 to **8-bit** packed **RGBA**. Same @@ -490,7 +491,7 @@ pub(crate) fn p_n_444_16_to_rgb_row( /// /// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgba_row( +pub(crate) fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -498,13 +499,13 @@ pub(crate) fn p_n_444_16_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } /// Shared P416 → 8-bit RGB / RGBA kernel. `ALPHA = false` emits 3 bpp; /// `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -522,8 +523,8 @@ pub(crate) fn p_n_444_16_to_rgb_or_rgba_row( let bias = chroma_bias::<16>(); for x in 0..width { - let u_sample = uv_full[x * 2]; - let v_sample = uv_full[x * 2 + 1]; + let u_sample = load_u16::(uv_full[x * 2]); + let v_sample = load_u16::(uv_full[x * 2 + 1]); let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -531,7 +532,7 @@ pub(crate) fn p_n_444_16_to_rgb_or_rgba_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -554,7 +555,7 @@ pub(crate) fn p_n_444_16_to_rgb_or_rgba_row( /// - `y.len() >= width`, `uv_full.len() >= 2 * width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgb_u16_row( +pub(crate) fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -562,7 +563,7 @@ pub(crate) fn p_n_444_16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Converts one row of P416 to **native-depth `u16`** packed @@ -571,7 +572,7 @@ pub(crate) fn p_n_444_16_to_rgb_u16_row( /// /// Thin wrapper over [`p_n_444_16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgba_u16_row( +pub(crate) fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -579,7 +580,7 @@ pub(crate) fn p_n_444_16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } /// Shared P416 → native-depth `u16` RGB / RGBA kernel. `ALPHA = false` @@ -587,7 +588,7 @@ pub(crate) fn p_n_444_16_to_rgba_u16_row( /// alpha. Uses i64 chroma multiply (same rationale as /// [`p_n_444_16_to_rgb_u16_row`]). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -606,8 +607,8 @@ pub(crate) fn p_n_444_16_to_rgb_or_rgba_u16_row( let out_max: i32 = 0xFFFF; for x in 0..width { - let u_sample = uv_full[x * 2]; - let v_sample = uv_full[x * 2 + 1]; + let u_sample = load_u16::(uv_full[x * 2]); + let v_sample = load_u16::(uv_full[x * 2 + 1]); let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -615,7 +616,7 @@ pub(crate) fn p_n_444_16_to_rgb_or_rgba_u16_row( let g_chroma = q15_chroma64(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale64(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; diff --git a/src/row/scalar/tests.rs b/src/row/scalar/tests.rs index 2bf38a79..8bd87a71 100644 --- a/src/row/scalar/tests.rs +++ b/src/row/scalar/tests.rs @@ -246,7 +246,7 @@ fn yuv420p10_rgb_black_full_range() { let u = [512u16; 
2]; let v = [512u16; 2]; let mut rgb = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -257,7 +257,7 @@ fn yuv420p10_rgb_white_full_range() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); } @@ -268,7 +268,7 @@ fn yuv420p10_rgb_gray_is_gray() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); assert_eq!(r, g); @@ -284,7 +284,7 @@ fn yuv420p10_rgb_limited_range_black_and_white() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, false); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); @@ -298,7 +298,7 @@ fn yuv420p10_rgb_chroma_shared_across_pair() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); // Full-range 10→8 scale = 255/1023, so Y=200 → 50, Y=800 → 199.4 → 199. // Allow ±1 for Q15 rounding. 
assert!(rgb[0].abs_diff(50) <= 1, "got {}", rgb[0]); @@ -315,7 +315,7 @@ fn yuv420p10_rgb_u16_black_full_range() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u16; 12]; - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -326,7 +326,7 @@ fn yuv420p10_rgb_u16_white_full_range() { let u = [512u16; 2]; let v = [512u16; 2]; let mut rgb = [0u16; 12]; - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); } @@ -337,7 +337,7 @@ fn yuv420p10_rgb_u16_limited_range_endpoints() { let u = [512u16; 1]; let v = [512u16; 1]; let mut rgb = [0u16; 6]; - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb, 2, ColorMatrix::Bt709, false); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb, 2, ColorMatrix::Bt709, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } @@ -353,8 +353,8 @@ fn yuv420p10_rgb_u16_preserves_full_10bit_precision() { let v = [512u16; 1]; let mut rgb8 = [0u8; 6]; let mut rgb16 = [0u16; 6]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb8, 2, ColorMatrix::Bt601, true); - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb16, 2, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb8, 2, ColorMatrix::Bt601, true); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb16, 2, ColorMatrix::Bt601, true); assert_eq!(rgb8[0], rgb8[3]); assert_ne!(rgb16[0], rgb16[3]); } @@ -367,8 +367,8 @@ fn yuv420p10_bt709_ycgco_differ_for_chroma() { let v = [800u16; 1]; let mut bt709 = [0u8; 6]; let mut ycgco = [0u8; 6]; - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut bt709, 2, ColorMatrix::Bt709, 
true); - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut ycgco, 2, ColorMatrix::YCgCo, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut bt709, 2, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut ycgco, 2, ColorMatrix::YCgCo, true); let sad: i32 = bt709 .iter() .zip(ycgco.iter()) @@ -391,7 +391,7 @@ fn p010_rgb_black_full_range() { let y = [0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; // U0 V0 U1 V1 let mut rgb = [0u8; 12]; - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 0), "got {rgb:?}"); } @@ -401,7 +401,7 @@ fn p010_rgb_white_full_range() { let y = [0xFFC0u16; 4]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 255), "got {rgb:?}"); } @@ -411,7 +411,7 @@ fn p010_rgb_gray_is_gray() { let y = [0x8000u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u8; 12]; - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b) = (rgb[x * 3], rgb[x * 3 + 1], rgb[x * 3 + 2]); assert_eq!(r, g); @@ -427,7 +427,7 @@ fn p010_rgb_limited_range_endpoints() { let y = [0x1000u16, 0x1000, 0xEB00, 0xEB00]; let uv = [0x8000u16, 0x8000, 0x8000, 0x8000]; let mut rgb = [0u8; 12]; - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (0, 0, 0)); assert_eq!((rgb[6], rgb[7], rgb[8]), (255, 255, 255)); @@ -447,7 +447,7 @@ fn p010_matches_yuv420p10_when_shifted() { let mut rgb_p10 = [0u8; 12]; let mut 
rgb_p010 = [0u8; 12]; - yuv_420p_n_to_rgb_row::<10>( + yuv_420p_n_to_rgb_row::<10, false>( &y_p10, &u_p10, &v_p10, @@ -456,7 +456,7 @@ fn p010_matches_yuv420p10_when_shifted() { ColorMatrix::Bt709, true, ); - p_n_to_rgb_row::<10>( + p_n_to_rgb_row::<10, false>( &y_p010, &uv_p010, &mut rgb_p010, @@ -474,7 +474,7 @@ fn p010_rgb_u16_white_full_range() { let y = [0xFFC0u16; 4]; let uv = [0x8000u16; 4]; let mut rgb = [0u16; 12]; - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb, 4, ColorMatrix::Bt601, true); assert!(rgb.iter().all(|&c| c == 1023), "got {rgb:?}"); } @@ -483,7 +483,7 @@ fn p010_rgb_u16_limited_range_endpoints() { let y = [0x1000u16, 0xEB00]; let uv = [0x8000u16, 0x8000]; let mut rgb = [0u16; 6]; - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb, 2, ColorMatrix::Bt709, false); assert_eq!((rgb[0], rgb[1], rgb[2]), (0, 0, 0)); assert_eq!((rgb[3], rgb[4], rgb[5]), (1023, 1023, 1023)); } @@ -498,7 +498,7 @@ fn yuv444p10_rgba_gray_alpha_is_ff() { let u = [512u16; 4]; let v = [512u16; 4]; let mut rgba = [0u8; 16]; - yuv_444p_n_to_rgba_row::<10>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + yuv_444p_n_to_rgba_row::<10, false>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], @@ -522,7 +522,7 @@ fn yuv444p10_rgba_u16_gray_alpha_is_1023() { let u = [512u16; 4]; let v = [512u16; 4]; let mut rgba = [0u16; 16]; - yuv_444p_n_to_rgba_u16_row::<10>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + yuv_444p_n_to_rgba_u16_row::<10, false>(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], @@ -546,7 +546,7 @@ fn yuv444p16_rgba_gray_alpha_is_ff() { let u = [0x8000u16; 4]; let v = [0x8000u16; 4]; let mut rgba = [0u8; 16]; - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + 
yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], @@ -570,7 +570,7 @@ fn yuv444p16_rgba_u16_gray_alpha_is_ffff() { let u = [0x8000u16; 4]; let v = [0x8000u16; 4]; let mut rgba = [0u16; 16]; - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], @@ -596,7 +596,7 @@ fn p410_rgba_gray_alpha_is_ff() { // 4 pixels × (U,V) per pixel = 8 elements. let uv = [0x8000u16; 8]; let mut rgba = [0u8; 16]; - p_n_444_to_rgba_row::<10>(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); + p_n_444_to_rgba_row::<10, false>(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], @@ -620,7 +620,7 @@ fn p416_rgba_u16_gray_alpha_is_ffff() { let y = [0x8000u16; 4]; let uv = [0x8000u16; 8]; let mut rgba = [0u16; 16]; - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba, 4, ColorMatrix::Bt601, true); for x in 0..4 { let (r, g, b, a) = ( rgba[x * 4], diff --git a/src/row/scalar/yuv_planar_16bit.rs b/src/row/scalar/yuv_planar_16bit.rs index 0ff73bbb..c49db3b2 100644 --- a/src/row/scalar/yuv_planar_16bit.rs +++ b/src/row/scalar/yuv_planar_16bit.rs @@ -1,4 +1,4 @@ -use super::*; +use super::{load_u16, *}; // ---- 16-bit YUV 4:2:0 → RGB (parallel kernel family) ------------------- // @@ -36,7 +36,7 @@ use super::*; /// - `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p16_to_rgb_row( +pub(crate) fn yuv_420p16_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -45,7 +45,7 @@ pub(crate) fn yuv_420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -61,7 +61,7 @@ pub(crate) fn yuv_420p16_to_rgb_row( // `row::yuv420p16_to_rgba_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p16_to_rgba_row( +pub(crate) fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -70,7 +70,7 @@ pub(crate) fn yuv_420p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -92,7 +92,7 @@ pub(crate) fn yuv_420p16_to_rgba_row( /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -102,7 +102,7 @@ pub(crate) fn yuv_420p16_to_rgba_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -127,7 +127,7 @@ pub(crate) fn yuv_420p16_to_rgba_with_alpha_src_row( /// u16 is in range. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) fn yuv_420p16_to_rgb_or_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -162,14 +162,14 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_row(u_half[c_idx]) as i32 - bias, c_scale); + let v_d = q15_scale(load_u16::(v_half[c_idx]) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -177,17 +177,17 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_row> 8` to fit the u8 output. - out[x * bpp + 3] = (a_src.as_ref().unwrap()[x] >> 8) as u8; + out[x * bpp + 3] = (load_u16::(a_src.as_ref().unwrap()[x]) >> 8) as u8; } else if ALPHA { out[x * bpp + 3] = 0xFF; } - let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale); + let y1 = q15_scale(load_u16::(y[x + 1]) as i32 - y_off, y_scale); out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); if ALPHA_SRC { - out[(x + 1) * bpp + 3] = (a_src.as_ref().unwrap()[x + 1] >> 8) as u8; + out[(x + 1) * bpp + 3] = (load_u16::(a_src.as_ref().unwrap()[x + 1]) >> 8) as u8; } else if ALPHA { out[(x + 1) * bpp + 3] = 0xFF; } @@ -208,7 +208,7 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -217,7 +217,7 @@ pub(crate) fn yuv_420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -232,7 +232,7 @@ pub(crate) fn 
yuv_420p16_to_rgb_u16_row( // `row::yuv420p16_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p16_to_rgba_u16_row( +pub(crate) fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -241,7 +241,7 @@ pub(crate) fn yuv_420p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -264,7 +264,7 @@ pub(crate) fn yuv_420p16_to_rgba_u16_row( /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -274,7 +274,7 @@ pub(crate) fn yuv_420p16_to_rgba_u16_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -297,7 +297,11 @@ pub(crate) fn yuv_420p16_to_rgba_u16_with_alpha_src_row( /// at native depth (full u16 range, no `bits_mask` needed). 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -330,14 +334,14 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row(u_half[c_idx]) as i32 - bias, c_scale); + let v_d = q15_scale(load_u16::(v_half[c_idx]) as i32 - bias, c_scale); let r_chroma = q15_chroma64(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma64(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale64(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -345,17 +349,17 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row(a_src.as_ref().unwrap()[x]); } else if ALPHA { out[x * bpp + 3] = 0xFFFF; } - let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale); + let y1 = q15_scale64(load_u16::(y[x + 1]) as i32 - y_off, y_scale); out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; if ALPHA_SRC { - out[(x + 1) * bpp + 3] = a_src.as_ref().unwrap()[x + 1]; + out[(x + 1) * bpp + 3] = load_u16::(a_src.as_ref().unwrap()[x + 1]); } else if ALPHA { out[(x + 1) * bpp + 3] = 0xFFFF; } @@ -370,7 +374,7 @@ pub(crate) fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -379,7 +383,7 @@ pub(crate) fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgb_out, width, matrix, full_range); + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgb_out, width, 
matrix, full_range); } /// YUV 4:4:4 planar **16‑bit** → packed **8‑bit** **RGBA**. Same @@ -389,7 +393,7 @@ pub(crate) fn yuv_444p16_to_rgb_row( /// /// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p16_to_rgba_row( +pub(crate) fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -398,7 +402,7 @@ pub(crate) fn yuv_444p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgba_out, width, matrix, full_range); + yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgba_out, width, matrix, full_range); } /// YUVA 4:4:4 16‑bit → packed **8‑bit** **RGBA**. Same numerical @@ -417,7 +421,7 @@ pub(crate) fn yuv_444p16_to_rgba_row( /// `a_src.len() >= width`, `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -427,7 +431,7 @@ pub(crate) fn yuv_444p16_to_rgba_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -452,7 +456,11 @@ pub(crate) fn yuv_444p16_to_rgba_with_alpha_src_row( /// u16 is in range. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -481,14 +489,14 @@ pub(crate) fn yuv_444p16_to_rgb_or_rgba_row(); for x in 0..width { - let u_d = q15_scale(u[x] as i32 - bias, c_scale); - let v_d = q15_scale(v[x] as i32 - bias, c_scale); + let u_d = q15_scale(load_u16::(u[x]) as i32 - bias, c_scale); + let v_d = q15_scale(load_u16::(v[x]) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -496,7 +504,7 @@ pub(crate) fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit the u8 output. - out[x * bpp + 3] = (a_src.as_ref().unwrap()[x] >> 8) as u8; + out[x * bpp + 3] = (load_u16::(a_src.as_ref().unwrap()[x]) >> 8) as u8; } else if ALPHA { out[x * bpp + 3] = 0xFF; } @@ -511,7 +519,7 @@ pub(crate) fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -520,7 +528,7 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -530,7 +538,7 @@ pub(crate) fn yuv_444p16_to_rgb_u16_row( /// /// Thin wrapper over [`yuv_444p16_to_rgb_or_rgba_u16_row`] with `ALPHA = true`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p16_to_rgba_u16_row( +pub(crate) fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -539,7 +547,7 @@ pub(crate) fn yuv_444p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -560,7 +568,7 @@ pub(crate) fn yuv_444p16_to_rgba_u16_row( /// `a_src.len() >= width`, `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -570,7 +578,7 @@ pub(crate) fn yuv_444p16_to_rgba_u16_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -593,7 +601,11 @@ pub(crate) fn yuv_444p16_to_rgba_u16_with_alpha_src_row( /// [`yuv_444p16_to_rgb_u16_row`]). 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -623,21 +635,21 @@ pub(crate) fn yuv_444p16_to_rgb_or_rgba_u16_row(u[x]) as i32 - bias, c_scale); + let v_d = q15_scale(load_u16::(v[x]) as i32 - bias, c_scale); let r_chroma = q15_chroma64(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma64(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale64(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). // 16-bit native-depth output keeps alpha verbatim — no shift. - out[x * bpp + 3] = a_src.as_ref().unwrap()[x]; + out[x * bpp + 3] = load_u16::(a_src.as_ref().unwrap()[x]); } else if ALPHA { out[x * bpp + 3] = 0xFFFF; } @@ -660,7 +672,7 @@ pub(crate) fn yuv_444p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -668,7 +680,7 @@ pub(crate) fn p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P016** to **8-bit** packed **RGBA**. Same @@ -681,7 +693,7 @@ pub(crate) fn p16_to_rgb_row( // `row::p016_to_rgba_row` lands in the follow-up SIMD/dispatcher PR. // Until then this thin wrapper has no caller. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p16_to_rgba_row( +pub(crate) fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -689,13 +701,13 @@ pub(crate) fn p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } /// Shared P016 → 8-bit RGB / RGBA kernel. `ALPHA = false` emits 3 bpp; /// `ALPHA = true` emits 4 bpp with constant `0xFF` alpha. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p16_to_rgb_or_rgba_row( +pub(crate) fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -716,8 +728,8 @@ pub(crate) fn p16_to_rgb_or_rgba_row( let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2]; - let v_sample = uv_half[c_idx * 2 + 1]; + let u_sample = load_u16::(uv_half[c_idx * 2]); + let v_sample = load_u16::(uv_half[c_idx * 2 + 1]); let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -725,7 +737,7 @@ pub(crate) fn p16_to_rgb_or_rgba_row( let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -733,7 +745,7 @@ pub(crate) fn p16_to_rgb_or_rgba_row( out[x * bpp + 3] = 0xFF; } - let y1 = q15_scale(y[x + 1] as i32 - y_off, y_scale); + let y1 = q15_scale(load_u16::(y[x + 1]) as i32 - y_off, y_scale); out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); @@ -751,7 +763,7 @@ pub(crate) fn p16_to_rgb_or_rgba_row( /// /// Thin wrapper over 
[`p16_to_rgb_or_rgba_u16_row`] with `ALPHA = false`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p16_to_rgb_u16_row( +pub(crate) fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -759,7 +771,7 @@ pub(crate) fn p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } /// Converts one row of **P016** to **native-depth `u16`** packed @@ -771,7 +783,7 @@ pub(crate) fn p16_to_rgb_u16_row( // `row::p016_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p16_to_rgba_u16_row( +pub(crate) fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -779,7 +791,7 @@ pub(crate) fn p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } /// Shared P016 → native-depth `u16` RGB / RGBA kernel. `ALPHA = false` @@ -788,7 +800,7 @@ pub(crate) fn p16_to_rgba_u16_row( /// /// Uses i64 chroma multiply (same rationale as [`yuv_420p16_to_rgb_or_rgba_u16_row`]). 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn p16_to_rgb_or_rgba_u16_row( +pub(crate) fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -810,8 +822,8 @@ pub(crate) fn p16_to_rgb_or_rgba_u16_row( let mut x = 0; while x < width { let c_idx = x / 2; - let u_sample = uv_half[c_idx * 2]; - let v_sample = uv_half[c_idx * 2 + 1]; + let u_sample = load_u16::(uv_half[c_idx * 2]); + let v_sample = load_u16::(uv_half[c_idx * 2 + 1]); let u_d = q15_scale(u_sample as i32 - bias, c_scale); let v_d = q15_scale(v_sample as i32 - bias, c_scale); @@ -819,7 +831,7 @@ pub(crate) fn p16_to_rgb_or_rgba_u16_row( let g_chroma = q15_chroma64(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma64(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale64(y[x] as i32 - y_off, y_scale); + let y0 = q15_scale64(load_u16::(y[x]) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -827,7 +839,7 @@ pub(crate) fn p16_to_rgb_or_rgba_u16_row( out[x * bpp + 3] = 0xFFFF; } - let y1 = q15_scale64(y[x + 1] as i32 - y_off, y_scale); + let y1 = q15_scale64(load_u16::(y[x + 1]) as i32 - y_off, y_scale); out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; diff --git a/src/row/scalar/yuv_planar_high_bit.rs b/src/row/scalar/yuv_planar_high_bit.rs index 22e22922..3816d91d 100644 --- a/src/row/scalar/yuv_planar_high_bit.rs +++ b/src/row/scalar/yuv_planar_high_bit.rs @@ -1,4 +1,4 @@ -use super::*; +use super::{load_u16, *}; // ---- High-bit-depth YUV 4:2:0 → RGB (BITS ∈ {10, 12, 14}) ------------- @@ -25,7 +25,7 @@ use super::*; /// - `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p_n_to_rgb_row( +pub(crate) fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -34,7 +34,7 @@ pub(crate) fn yuv_420p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -56,7 +56,7 @@ pub(crate) fn yuv_420p_n_to_rgb_row( // follow-up SIMD/dispatcher PR. Until then this thin wrapper has no // caller. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p_n_to_rgba_row( +pub(crate) fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -65,7 +65,7 @@ pub(crate) fn yuv_420p_n_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -88,7 +88,7 @@ pub(crate) fn yuv_420p_n_to_rgba_row( /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -98,7 +98,7 @@ pub(crate) fn yuv_420p_n_to_rgba_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -137,6 +137,7 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -175,26 +176,22 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row< let bias = chroma_bias::(); let mask = bits_mask::(); - // Every sample is AND‑masked to the low `BITS` bits on load. This - // eliminates architecture‑dependent divergence on mispacked input - // (e.g. 
`p010`‑style buffers where the 10 active bits sit in the - // high bits of each `u16`): after masking, every backend sees the - // same in‑range sample, so the whole Q15 pipeline stays bounded - // (intermediate chroma sums fit i16 as designed, no saturating - // narrow loses information). For valid input every mask is a + // Every sample is AND‑masked to the low `BITS` bits on load (after + // optional BE byte-swap). This eliminates architecture‑dependent + // divergence on mispacked input. For valid input every mask is a // no‑op. For malformed input the "wrong" output is identical // across scalar + all 5 SIMD backends. let mut x = 0; while x < width { let c_idx = x / 2; - let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); - let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + let u_d = q15_scale((load_u16::(u_half[c_idx]) & mask) as i32 - bias, c_scale); + let v_d = q15_scale((load_u16::(v_half[c_idx]) & mask) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) & mask) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -205,18 +202,18 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row< // out-of-range u16 samples, and an unmasked overrange value // (e.g. 1024 at BITS=10) would shift down to 256 → cast-to-u8 0, // silently turning over-range alpha into transparent output. 
- let a_u16 = a_src.as_ref().unwrap()[x] & mask; + let a_u16 = load_u16::(a_src.as_ref().unwrap()[x]) & mask; out[x * bpp + 3] = (a_u16 >> (BITS - 8)) as u8; } else if ALPHA { out[x * bpp + 3] = 0xFF; } - let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + let y1 = q15_scale((load_u16::(y[x + 1]) & mask) as i32 - y_off, y_scale); out[(x + 1) * bpp] = clamp_u8(y1 + r_chroma); out[(x + 1) * bpp + 1] = clamp_u8(y1 + g_chroma); out[(x + 1) * bpp + 2] = clamp_u8(y1 + b_chroma); if ALPHA_SRC { - let a_u16 = a_src.as_ref().unwrap()[x + 1] & mask; + let a_u16 = load_u16::(a_src.as_ref().unwrap()[x + 1]) & mask; out[(x + 1) * bpp + 3] = (a_u16 >> (BITS - 8)) as u8; } else if ALPHA { out[(x + 1) * bpp + 3] = 0xFF; @@ -253,7 +250,7 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row< /// - `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p_n_to_rgb_u16_row( +pub(crate) fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -262,7 +259,7 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -284,7 +281,7 @@ pub(crate) fn yuv_420p_n_to_rgb_u16_row( // `row::yuv420p10_to_rgba_u16_row` lands in the follow-up SIMD/dispatcher // PR. Until then this thin wrapper has no caller. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_420p_n_to_rgba_u16_row( +pub(crate) fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -293,7 +290,7 @@ pub(crate) fn yuv_420p_n_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -318,7 +315,7 @@ pub(crate) fn yuv_420p_n_to_rgba_u16_row( /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -328,7 +325,7 @@ pub(crate) fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -361,6 +358,7 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -397,25 +395,20 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row< let mask = bits_mask::(); let alpha_max: u16 = out_max as u16; - // Every sample AND‑masked to the low `BITS` bits — see matching - // comment in [`yuv_420p_n_to_rgb_or_rgba_row`]. Critical for the - // native‑depth u16 output path: `range_params_n::<10, 10>` uses - // `y_scale = c_scale = 32768` (unit Q15 for BITS==OUT_BITS full - // range), so an unmasked out‑of‑range sample would push `u_d` / - // `v_d` to ±32256 and the subsequent `coeff * v_d` exceeds i16 - // range — breaking the SIMD kernels' `vqmovn_s32` narrow step. + // Every sample AND‑masked to the low `BITS` bits (after optional BE + // byte-swap). See matching comment in `yuv_420p_n_to_rgb_or_rgba_row`. // Masking keeps every intermediate bounded by design. 
let mut x = 0; while x < width { let c_idx = x / 2; - let u_d = q15_scale((u_half[c_idx] & mask) as i32 - bias, c_scale); - let v_d = q15_scale((v_half[c_idx] & mask) as i32 - bias, c_scale); + let u_d = q15_scale((load_u16::(u_half[c_idx]) & mask) as i32 - bias, c_scale); + let v_d = q15_scale((load_u16::(v_half[c_idx]) & mask) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) & mask) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -425,17 +418,17 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row< // out-of-range u16 samples, and the documented native-depth // output range is `[0, (1 << BITS) - 1]`. Without masking, an // overrange `1024` at BITS=10 would leak straight to output. 
- out[x * bpp + 3] = a_src.as_ref().unwrap()[x] & mask; + out[x * bpp + 3] = load_u16::(a_src.as_ref().unwrap()[x]) & mask; } else if ALPHA { out[x * bpp + 3] = alpha_max; } - let y1 = q15_scale((y[x + 1] & mask) as i32 - y_off, y_scale); + let y1 = q15_scale((load_u16::(y[x + 1]) & mask) as i32 - y_off, y_scale); out[(x + 1) * bpp] = (y1 + r_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 1] = (y1 + g_chroma).clamp(0, out_max) as u16; out[(x + 1) * bpp + 2] = (y1 + b_chroma).clamp(0, out_max) as u16; if ALPHA_SRC { - out[(x + 1) * bpp + 3] = a_src.as_ref().unwrap()[x + 1] & mask; + out[(x + 1) * bpp + 3] = load_u16::(a_src.as_ref().unwrap()[x + 1]) & mask; } else if ALPHA { out[(x + 1) * bpp + 3] = alpha_max; } @@ -457,7 +450,7 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p_n_to_rgb_row( +pub(crate) fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -466,7 +459,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -484,7 +477,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( /// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p_n_to_rgba_row( +pub(crate) fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -493,7 +486,7 @@ pub(crate) fn yuv_444p_n_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -513,7 +506,7 @@ pub(crate) fn yuv_444p_n_to_rgba_row( /// `a_src.len() >= width`, `rgba_out.len() >= 4 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -523,7 +516,7 @@ pub(crate) fn yuv_444p_n_to_rgba_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -560,6 +553,7 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -598,14 +592,14 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_row< for x in 0..width { // 4:4:4: one UV pair per pixel, no subsampling. - let u_d = q15_scale((u[x] & mask) as i32 - bias, c_scale); - let v_d = q15_scale((v[x] & mask) as i32 - bias, c_scale); + let u_d = q15_scale((load_u16::(u[x]) & mask) as i32 - bias, c_scale); + let v_d = q15_scale((load_u16::(v[x]) & mask) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) & mask) as i32 - y_off, y_scale); out[x * bpp] = clamp_u8(y0 + r_chroma); out[x * bpp + 1] = clamp_u8(y0 + g_chroma); out[x * bpp + 2] = clamp_u8(y0 + b_chroma); @@ -616,7 +610,7 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_row< // out-of-range u16 samples, and an unmasked overrange value // (e.g. 1024 at BITS=10) would shift down to 256 → cast-to-u8 0, // silently turning over-range alpha into transparent output. 
- let a_u16 = a_src.as_ref().unwrap()[x] & mask; + let a_u16 = load_u16::(a_src.as_ref().unwrap()[x]) & mask; out[x * bpp + 3] = (a_u16 >> (BITS - 8)) as u8; } else if ALPHA { out[x * bpp + 3] = 0xFF; @@ -635,7 +629,7 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_row< /// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `rgb_out.len() >= 3 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p_n_to_rgb_u16_row( +pub(crate) fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -644,7 +638,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -663,7 +657,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( /// - `y.len() >= width`, `u.len() >= width`, `v.len() >= width`, /// `rgba_out.len() >= 4 * width`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn yuv_444p_n_to_rgba_u16_row( +pub(crate) fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -672,7 +666,7 @@ pub(crate) fn yuv_444p_n_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -693,7 +687,7 @@ pub(crate) fn yuv_444p_n_to_rgba_u16_row( /// `a_src.len() >= width`, `rgba_out.len() >= 4 * width`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -703,7 +697,7 @@ pub(crate) fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -734,6 +728,7 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -770,14 +765,14 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_u16_row< let alpha_max: u16 = out_max as u16; for x in 0..width { - let u_d = q15_scale((u[x] & mask) as i32 - bias, c_scale); - let v_d = q15_scale((v[x] & mask) as i32 - bias, c_scale); + let u_d = q15_scale((load_u16::(u[x]) & mask) as i32 - bias, c_scale); + let v_d = q15_scale((load_u16::(v[x]) & mask) as i32 - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); let b_chroma = q15_chroma(coeffs.b_u(), u_d, coeffs.b_v(), v_d); - let y0 = q15_scale((y[x] & mask) as i32 - y_off, y_scale); + let y0 = q15_scale((load_u16::(y[x]) & mask) as i32 - y_off, y_scale); out[x * bpp] = (y0 + r_chroma).clamp(0, out_max) as u16; out[x * bpp + 1] = (y0 + g_chroma).clamp(0, out_max) as u16; out[x * bpp + 2] = (y0 + b_chroma).clamp(0, out_max) as u16; @@ -787,7 +782,7 @@ pub(crate) fn yuv_444p_n_to_rgb_or_rgba_u16_row< // out-of-range u16 samples, and the documented native-depth // output range is `[0, (1 << BITS) - 1]`. Without masking, an // overrange `1024` at BITS=10 would leak straight to output. 
- out[x * bpp + 3] = a_src.as_ref().unwrap()[x] & mask; + out[x * bpp + 3] = load_u16::(a_src.as_ref().unwrap()[x]) & mask; } else if ALPHA { out[x * bpp + 3] = alpha_max; } diff --git a/src/sinker/mixed/tests/yuva/sub_4_2_0.rs b/src/sinker/mixed/tests/yuva/sub_4_2_0.rs index 8d1c8f77..fb03d662 100644 --- a/src/sinker/mixed/tests/yuva/sub_4_2_0.rs +++ b/src/sinker/mixed/tests/yuva/sub_4_2_0.rs @@ -867,7 +867,7 @@ fn yuva420p9_strategy_a_plus_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgb_row::<9, false>( y_row, u_row, v_row, @@ -876,7 +876,7 @@ fn yuva420p9_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -956,7 +956,7 @@ fn yuva420p9_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<9, false>( y_row, u_row, v_row, @@ -965,7 +965,7 @@ fn yuva420p9_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -1047,7 +1047,7 @@ fn yuva420p10_strategy_a_plus_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgb_row::<10, false>( y_row, u_row, 
v_row, @@ -1056,7 +1056,7 @@ fn yuva420p10_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -1136,7 +1136,7 @@ fn yuva420p10_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( y_row, u_row, v_row, @@ -1145,7 +1145,7 @@ fn yuva420p10_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -1229,7 +1229,7 @@ fn yuva420p16_strategy_a_plus_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p16_to_rgb_row( + crate::row::scalar::yuv_420p16_to_rgb_row::( y_row, u_row, v_row, @@ -1238,7 +1238,7 @@ fn yuva420p16_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p16_to_rgba_with_alpha_src_row( + crate::row::scalar::yuv_420p16_to_rgba_with_alpha_src_row::( y_row, u_row, v_row, @@ -1318,7 +1318,7 @@ fn yuva420p16_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[(r / 2) * cw..(r / 2 + 1) * cw]; let v_row = &vp[(r / 2) * cw..(r / 2 + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p16_to_rgb_u16_row( + crate::row::scalar::yuv_420p16_to_rgb_u16_row::( y_row, u_row, v_row, @@ -1327,7 +1327,7 @@ fn yuva420p16_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - 
crate::row::scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + crate::row::scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y_row, u_row, v_row, diff --git a/src/sinker/mixed/tests/yuva/sub_4_2_2.rs b/src/sinker/mixed/tests/yuva/sub_4_2_2.rs index 5e7d9464..32956945 100644 --- a/src/sinker/mixed/tests/yuva/sub_4_2_2.rs +++ b/src/sinker/mixed/tests/yuva/sub_4_2_2.rs @@ -516,7 +516,7 @@ fn yuva422p9_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgb_row::<9, false>( y_row, u_row, v_row, @@ -525,7 +525,7 @@ fn yuva422p9_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -604,7 +604,7 @@ fn yuva422p9_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<9, false>( y_row, u_row, v_row, @@ -613,7 +613,7 @@ fn yuva422p9_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -692,7 +692,7 @@ fn yuva422p10_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgb_row::<10, false>( y_row, u_row, v_row, @@ -701,7 +701,7 @@ fn yuva422p10_strategy_a_plus_matches_independent_kernel() 
{ matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -780,7 +780,7 @@ fn yuva422p10_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( y_row, u_row, v_row, @@ -789,7 +789,7 @@ fn yuva422p10_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -868,7 +868,7 @@ fn yuva422p12_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_row::<12>( + crate::row::scalar::yuv_420p_n_to_rgb_row::<12, false>( y_row, u_row, v_row, @@ -877,7 +877,7 @@ fn yuva422p12_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12>( + crate::row::scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( y_row, u_row, v_row, @@ -956,7 +956,7 @@ fn yuva422p12_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<12>( + crate::row::scalar::yuv_420p_n_to_rgb_u16_row::<12, false>( y_row, u_row, v_row, @@ -965,7 +965,7 @@ fn yuva422p12_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12>( + 
crate::row::scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y_row, u_row, v_row, @@ -1045,7 +1045,7 @@ fn yuva422p16_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p16_to_rgb_row( + crate::row::scalar::yuv_420p16_to_rgb_row::( y_row, u_row, v_row, @@ -1054,7 +1054,7 @@ fn yuva422p16_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p16_to_rgba_with_alpha_src_row( + crate::row::scalar::yuv_420p16_to_rgba_with_alpha_src_row::( y_row, u_row, v_row, @@ -1133,7 +1133,7 @@ fn yuva422p16_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * cw..(r + 1) * cw]; let v_row = &vp[r * cw..(r + 1) * cw]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_420p16_to_rgb_u16_row( + crate::row::scalar::yuv_420p16_to_rgb_u16_row::( y_row, u_row, v_row, @@ -1142,7 +1142,7 @@ fn yuva422p16_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + crate::row::scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( y_row, u_row, v_row, diff --git a/src/sinker/mixed/tests/yuva/sub_4_4_4.rs b/src/sinker/mixed/tests/yuva/sub_4_4_4.rs index 9360612b..57017355 100644 --- a/src/sinker/mixed/tests/yuva/sub_4_4_4.rs +++ b/src/sinker/mixed/tests/yuva/sub_4_4_4.rs @@ -1152,7 +1152,7 @@ fn yuva444p9_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_row::<9>( + crate::row::scalar::yuv_444p_n_to_rgb_row::<9, false>( y_row, u_row, v_row, @@ -1161,7 +1161,7 @@ fn yuva444p9_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<9>( 
+ crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -1239,7 +1239,7 @@ fn yuva444p9_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<9>( + crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<9, false>( y_row, u_row, v_row, @@ -1248,7 +1248,7 @@ fn yuva444p9_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9>( + crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( y_row, u_row, v_row, @@ -1326,7 +1326,7 @@ fn yuva444p10_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_row::<10>( + crate::row::scalar::yuv_444p_n_to_rgb_row::<10, false>( y_row, u_row, v_row, @@ -1335,7 +1335,7 @@ fn yuva444p10_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -1413,7 +1413,7 @@ fn yuva444p10_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<10>( + crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<10, false>( y_row, u_row, v_row, @@ -1422,7 +1422,7 @@ fn yuva444p10_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -1500,7 
+1500,7 @@ fn yuva444p12_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_row::<12>( + crate::row::scalar::yuv_444p_n_to_rgb_row::<12, false>( y_row, u_row, v_row, @@ -1509,7 +1509,7 @@ fn yuva444p12_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<12>( + crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( y_row, u_row, v_row, @@ -1587,7 +1587,7 @@ fn yuva444p12_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<12>( + crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<12, false>( y_row, u_row, v_row, @@ -1596,7 +1596,7 @@ fn yuva444p12_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12>( + crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( y_row, u_row, v_row, @@ -1674,7 +1674,7 @@ fn yuva444p14_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_row::<14>( + crate::row::scalar::yuv_444p_n_to_rgb_row::<14, false>( y_row, u_row, v_row, @@ -1683,7 +1683,7 @@ fn yuva444p14_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<14>( + crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( y_row, u_row, v_row, @@ -1761,7 +1761,7 @@ fn yuva444p14_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let 
v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<14>( + crate::row::scalar::yuv_444p_n_to_rgb_u16_row::<14, false>( y_row, u_row, v_row, @@ -1770,7 +1770,7 @@ fn yuva444p14_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14>( + crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( y_row, u_row, v_row, @@ -1849,7 +1849,7 @@ fn yuva444p16_strategy_a_plus_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p16_to_rgb_row( + crate::row::scalar::yuv_444p16_to_rgb_row::( y_row, u_row, v_row, @@ -1858,7 +1858,7 @@ fn yuva444p16_strategy_a_plus_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p16_to_rgba_with_alpha_src_row( + crate::row::scalar::yuv_444p16_to_rgba_with_alpha_src_row::( y_row, u_row, v_row, @@ -1936,7 +1936,7 @@ fn yuva444p16_strategy_a_plus_u16_matches_independent_kernel() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p16_to_rgb_u16_row( + crate::row::scalar::yuv_444p16_to_rgb_u16_row::( y_row, u_row, v_row, @@ -1945,7 +1945,7 @@ fn yuva444p16_strategy_a_plus_u16_matches_independent_kernel() { matrix, full_range, ); - crate::row::scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + crate::row::scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( y_row, u_row, v_row, @@ -2040,7 +2040,7 @@ fn yuva444p10_strategy_a_plus_overrange_alpha_matches_inline_alpha() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + 
crate::row::scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( y_row, u_row, v_row, @@ -2077,7 +2077,7 @@ fn yuva444p10_strategy_a_plus_overrange_alpha_matches_inline_alpha() { let u_row = &up[r * width..(r + 1) * width]; let v_row = &vp[r * width..(r + 1) * width]; let a_row = &ap[r * width..(r + 1) * width]; - crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + crate::row::scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( y_row, u_row, v_row, From 63624e95640ae6f59e39988777384337a251a214 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 02:08:19 +1200 Subject: [PATCH 2/8] feat(be-yuv-hb): BE support for x86 + wasm-simd128 high-bit YUV/P-format row kernels MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends the BE-aware kernel rollout (scalar + NEON in 5b989ba) to the remaining four SIMD backends — SSE4.1, AVX2, AVX-512, and wasm-simd128 — for high-bit YUV planar and P-format kernels. Files updated (16 backend files × 4 kernel families): - src/row/arch/x86_sse41/{yuv_planar_high_bit, yuv_planar_16bit, subsampled_high_bit_pn_4_2_0, subsampled_high_bit_pn_4_4_4}.rs - src/row/arch/x86_avx2/{yuv_planar_high_bit, yuv_planar_16bit, subsampled_high_bit_pn_4_2_0, subsampled_high_bit_pn_4_4_4}.rs - src/row/arch/x86_avx512/{yuv_planar_high_bit, yuv_planar_16bit, subsampled_high_bit_pn_4_2_0, subsampled_high_bit_pn_4_4_4}.rs - src/row/arch/wasm_simd128/{yuv_planar_high_bit, yuv_planar_16bit, subsampled_high_bit_pn_4_2_0, subsampled_high_bit_pn_4_4_4}.rs Each pub(crate) row kernel now takes ; SIMD u16 loads go through endian::load_endian_u16x{8,16,32}:: per backend. Tail fallbacks forward BE to the corresponding scalar kernels. Lane strategy decisions per backend: - All planar 4:2:0 / 4:4:4 / 16-bit kernels: native endian-aware `load_endian_u16x{N}::` for every Y/U/V load. 
- Pn/P016/P416 semi-planar interleaved-UV kernels: deinterleave first via the existing `deinterleave_uv_u16{,_avx2,_avx512,_wasm}` helper, then byte-swap each U/V vector with a local `byteswap_u16x{8,16,32} ::` helper (per-arch). Compiles away when BE = false. Cost is one extra pshufb per deinterleaved vector on BE — strictly minor versus the deinterleave itself, and keeps the LE fast path identical. - AVX2/AVX-512 P016 u16-output paths use a half-width 256-bit / 128-bit inline `shuffle_epi8` for the `_mm256_loadu_si256` / `_mm_loadu_si128` half-loads (no public 256-bit / 128-bit endian helper in the AVX2 / AVX-512 endian module). Same pattern for AVX2/AVX-512/wasm 4:2:0 half-load chroma — fold per-u16-lane swap into the deinterleave mask where possible (P016 u16 paths) or use a dedicated post-load swizzle (4:2:0 u16 outputs). - wasm yuv_420p16 u16-output: half-width `v128_load64_zero` uses an inline `u8x16_swizzle` for the BE byte-swap (high 8 bytes are zero so same shuffle leaves them unchanged). All 9 NEON BE parity tests pass on aarch64 (+9 vs 5b989ba=2159, total 2168). Five new tests/be_parity.rs files added across all 5 backends — 36 BE parity tests total exercising yuv_420p_n, yuv_444p_n, yuv_*p16, P010, P410, P016, P416 across u8 + u16 outputs. Each test takes a randomized LE input, byte-swaps every u16, and asserts kernel on swapped == kernel on original. x86/wasm test sites updated to forward `, false` through generic chain (matches NEON's existing pattern from 5b989ba). Verified: - cargo test --target aarch64-apple-darwin: 2168 passed, 0 failed - cargo build --target x86_64-apple-darwin --tests: clean - RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests: clean - cargo build --no-default-features: clean - cargo fmt --check: clean - cargo clippy --all-targets -- -D warnings: clean (aarch64 host) Sinker call sites remain hardcoded `` per task spec (out of scope for this tranche). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../arch/neon/subsampled_high_bit_pn_4_4_4.rs | 8 +- src/row/arch/neon/tests/be_parity.rs | 278 ++++++++++++++++++ src/row/arch/neon/tests/high_bit_4_2_0.rs | 99 ++++++- .../arch/neon/tests/high_bit_4_4_4_and_pn.rs | 69 ++++- src/row/arch/neon/tests/mod.rs | 1 + src/row/arch/neon/tests/yuva.rs | 29 +- src/row/arch/neon/yuv_planar_16bit.rs | 2 +- .../subsampled_high_bit_pn_4_2_0.rs | 122 +++++--- .../subsampled_high_bit_pn_4_4_4.rs | 138 ++++++--- src/row/arch/wasm_simd128/tests/be_parity.rs | 234 +++++++++++++++ .../arch/wasm_simd128/tests/high_bit_4_2_0.rs | 124 ++++++-- .../tests/high_bit_4_4_4_and_pn.rs | 107 ++++--- src/row/arch/wasm_simd128/tests/mod.rs | 1 + .../wasm_simd128/tests/planar_8bit_and_nv.rs | 28 +- src/row/arch/wasm_simd128/tests/yuva.rs | 67 +++-- src/row/arch/wasm_simd128/yuv_planar_16bit.rs | 182 ++++++++---- .../arch/wasm_simd128/yuv_planar_high_bit.rs | 226 +++++++++----- .../x86_avx2/subsampled_high_bit_pn_4_2_0.rs | 139 ++++++--- .../x86_avx2/subsampled_high_bit_pn_4_4_4.rs | 145 ++++++--- src/row/arch/x86_avx2/tests/be_parity.rs | 261 ++++++++++++++++ src/row/arch/x86_avx2/tests/high_bit_4_2_0.rs | 112 +++++-- .../x86_avx2/tests/high_bit_4_4_4_and_pn.rs | 107 ++++--- src/row/arch/x86_avx2/tests/mod.rs | 1 + .../arch/x86_avx2/tests/planar_8bit_and_nv.rs | 40 ++- src/row/arch/x86_avx2/tests/yuva.rs | 67 +++-- src/row/arch/x86_avx2/yuv_planar_16bit.rs | 174 +++++++---- src/row/arch/x86_avx2/yuv_planar_high_bit.rs | 212 ++++++++----- .../subsampled_high_bit_pn_4_2_0.rs | 134 ++++++--- .../subsampled_high_bit_pn_4_4_4.rs | 146 ++++++--- src/row/arch/x86_avx512/tests/be_parity.rs | 265 +++++++++++++++++ .../arch/x86_avx512/tests/high_bit_4_2_0.rs | 112 +++++-- .../x86_avx512/tests/high_bit_4_4_4_and_pn.rs | 107 ++++--- src/row/arch/x86_avx512/tests/mod.rs | 1 + .../x86_avx512/tests/planar_8bit_and_nv.rs | 48 ++- src/row/arch/x86_avx512/tests/yuva.rs | 67 +++-- 
src/row/arch/x86_avx512/yuv_planar_16bit.rs | 171 +++++++---- .../arch/x86_avx512/yuv_planar_high_bit.rs | 212 ++++++++----- .../x86_sse41/subsampled_high_bit_pn_4_2_0.rs | 142 ++++++--- .../x86_sse41/subsampled_high_bit_pn_4_4_4.rs | 145 ++++++--- src/row/arch/x86_sse41/tests/be_parity.rs | 261 ++++++++++++++++ .../arch/x86_sse41/tests/high_bit_4_2_0.rs | 112 +++++-- .../x86_sse41/tests/high_bit_4_4_4_and_pn.rs | 107 ++++--- src/row/arch/x86_sse41/tests/mod.rs | 1 + .../x86_sse41/tests/planar_8bit_and_nv.rs | 40 ++- src/row/arch/x86_sse41/tests/yuva.rs | 67 +++-- src/row/arch/x86_sse41/yuv_planar_16bit.rs | 176 +++++++---- src/row/arch/x86_sse41/yuv_planar_high_bit.rs | 226 +++++++++----- src/row/dispatch/yuv420/yuv420p10.rs | 12 +- src/row/dispatch/yuv420/yuv420p12.rs | 12 +- src/row/dispatch/yuv420/yuv420p14.rs | 12 +- src/row/dispatch/yuv420/yuv420p16.rs | 4 +- src/row/dispatch/yuv420/yuv420p9.rs | 12 +- src/row/dispatch/yuva/sub_4_4_4.rs | 4 +- src/row/scalar/yuv_planar_16bit.rs | 14 +- src/row/scalar/yuv_planar_high_bit.rs | 20 +- 55 files changed, 4376 insertions(+), 1227 deletions(-) create mode 100644 src/row/arch/neon/tests/be_parity.rs create mode 100644 src/row/arch/wasm_simd128/tests/be_parity.rs create mode 100644 src/row/arch/x86_avx2/tests/be_parity.rs create mode 100644 src/row/arch/x86_avx512/tests/be_parity.rs create mode 100644 src/row/arch/x86_sse41/tests/be_parity.rs diff --git a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs index ae731333..2367e9ef 100644 --- a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs @@ -671,9 +671,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::( - y, uv_full, rgb_out, width, matrix, full_range, - ); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -698,9 +696,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::( - y, uv_full, rgba_out, width, matrix, full_range, - ); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } } diff --git a/src/row/arch/neon/tests/be_parity.rs b/src/row/arch/neon/tests/be_parity.rs new file mode 100644 index 00000000..f774fac8 --- /dev/null +++ b/src/row/arch/neon/tests/be_parity.rs @@ -0,0 +1,278 @@ +//! BE parity tests for NEON high-bit YUV / P-format kernels. +//! +//! Each test takes a randomized LE input buffer, byte-swaps every u16 +//! element to produce a BE-encoded buffer, then asserts that +//! `kernel::(swapped_input)` produces byte-identical output +//! to `kernel::(original_input)`. This is the formal +//! parity contract for the BE-aware kernels: BE input is a swapped +//! representation of the same logical pixel data, so the output must +//! match. 
+ +use crate::row::neon_available; + +use super::{ + super::*, high_bit_plane, interleave_uv, p_n_packed_plane, p010_uv_interleave, p16_plane_neon, + planar_n_plane, +}; + +fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf.iter().map(|x| x.swap_bytes()).collect() +} + +// ---- yuv_420p_n (planar 4:2:0 high-bit) ----------------------------- + +#[test] +fn neon_yuv_420p10_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = planar_n_plane::<10>(width, 13); + let u = planar_n_plane::<10>(width / 2, 17); + let v = planar_n_plane::<10>(width / 2, 19); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + for matrix in [ColorMatrix::Bt709, ColorMatrix::Bt2020Ncl] { + for full_range in [true, false] { + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + matrix, + full_range, + ); + } + assert_eq!(out_le, out_be, "matrix={matrix:?} full_range={full_range}"); + } + } +} + +#[test] +fn neon_yuv_420p10_be_parity_u16() { + if !neon_available() { + return; + } + let width = 32; + let y = planar_n_plane::<10>(width, 23); + let u = planar_n_plane::<10>(width / 2, 29); + let v = planar_n_plane::<10>(width / 2, 31); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + yuv_420p_n_to_rgb_u16_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +// ---- yuv_444p_n (planar 4:4:4 high-bit) 
----------------------------- + +#[test] +fn neon_yuv_444p12_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = planar_n_plane::<12>(width, 41); + let u = planar_n_plane::<12>(width, 43); + let v = planar_n_plane::<12>(width, 47); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +// ---- yuv_*p16 (16-bit planar) ---------------------------------------- + +#[test] +fn neon_yuv_420p16_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = p16_plane_neon(width, 53); + let u = p16_plane_neon(width / 2, 59); + let v = p16_plane_neon(width / 2, 61); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn neon_yuv_444p16_be_parity_u16() { + if !neon_available() { + return; + } + let width = 32; + let y = p16_plane_neon(width, 67); + let u = p16_plane_neon(width, 71); + let v = p16_plane_neon(width, 73); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + 
yuv_444p16_to_rgb_u16_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +// ---- p_n / p_n_444 (semi-planar high-bit-packed) -------------------- + +#[test] +fn neon_p010_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = p_n_packed_plane::<10>(width, 79); + let u_half = p_n_packed_plane::<10>(width / 2, 83); + let v_half = p_n_packed_plane::<10>(width / 2, 89); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn neon_p410_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = p_n_packed_plane::<10>(width, 97); + let u_full = high_bit_plane::<10>(width, 101); + let v_full = high_bit_plane::<10>(width, 103); + let uv_full = interleave_uv(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn neon_p016_be_parity_u8() { + if !neon_available() { + return; + } + let width = 32; + let y = p16_plane_neon(width, 107); + let u_half = p16_plane_neon(width / 2, 109); + let v_half = p16_plane_neon(width / 2, 113); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + 
+ let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p16_to_rgb_row::(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn neon_p416_be_parity_u16() { + if !neon_available() { + return; + } + let width = 32; + let y = p16_plane_neon(width, 127); + let u_full = p16_plane_neon(width, 131); + let v_full = p16_plane_neon(width, 137); + let uv_full = interleave_uv(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_444_16_to_rgb_u16_row::(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} diff --git a/src/row/arch/neon/tests/high_bit_4_2_0.rs b/src/row/arch/neon/tests/high_bit_4_2_0.rs index d6368382..8bad3dd9 100644 --- a/src/row/arch/neon/tests/high_bit_4_2_0.rs +++ b/src/row/arch/neon/tests/high_bit_4_2_0.rs @@ -18,7 +18,15 @@ fn check_p10_u8_equivalence(width: usize, matrix: ColorMatrix, full_range: bool) let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -43,7 +51,15 @@ fn check_p10_u16_equivalence(width: usize, matrix: ColorMatrix, full_range: bool let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_scalar, 
width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -131,7 +147,15 @@ fn check_p_n_u8_equivalence(width: usize, matrix: ColorMatrix, let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -148,7 +172,15 @@ fn check_p_n_u16_equivalence(width: usize, matrix: ColorMatrix, let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -239,7 +271,15 @@ fn neon_p10_matches_scalar_on_out_of_range_samples() { for full_range in [true, false] { let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -260,7 +300,15 @@ fn neon_p10_matches_scalar_on_out_of_range_samples() { full_range, ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb16_neon, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + 
&mut rgb16_neon, + width, + matrix, + full_range, + ); } assert_eq!( rgb16_scalar, rgb16_neon, @@ -429,7 +477,14 @@ fn neon_p010_matches_scalar_on_mispacked_input() { let mut rgb16_scalar = std::vec![0u16; width * 3]; let mut rgb16_neon = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb16_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>( + &y, + &uv, + &mut rgb16_scalar, + width, + matrix, + full_range, + ); unsafe { p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb16_neon, width, matrix, full_range); } @@ -454,7 +509,15 @@ fn check_planar_u8_neon_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -471,7 +534,15 @@ fn check_planar_u16_neon_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -592,7 +663,15 @@ fn check_planar_u8_neon_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { 
yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } diff --git a/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs index dc1accc2..24ebaac3 100644 --- a/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/neon/tests/high_bit_4_4_4_and_pn.rs @@ -31,7 +31,15 @@ fn check_planar_u16_neon_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_neon, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_neon, @@ -124,7 +132,15 @@ fn check_yuv420p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_neon(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } @@ -204,7 +220,15 @@ fn check_yuv444p_n_u8_neon_equivalence( let v = planar_n_plane::(width, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_neon = std::vec![0u8; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -224,7 +248,15 @@ fn check_yuv444p_n_u16_neon_equivalence( let v = planar_n_plane::(width, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + 
scalar::yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -334,7 +366,15 @@ fn check_yuv444p16_u16_neon_equivalence(width: usize, matrix: ColorMatrix, full_ let v = p16_plane_neon(width, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_neon, width, matrix, full_range); } @@ -405,7 +445,14 @@ fn check_p_n_444_u16_neon_equivalence( let uv = interleave_uv(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_neon = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::( + &y, + &uv, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_neon, width, matrix, full_range); } @@ -534,7 +581,15 @@ fn check_yuv444p_n_u8_neon_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_neon = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } diff --git a/src/row/arch/neon/tests/mod.rs b/src/row/arch/neon/tests/mod.rs index aa1e28ac..f31c0bbc 100644 --- a/src/row/arch/neon/tests/mod.rs +++ b/src/row/arch/neon/tests/mod.rs @@ -1,6 +1,7 @@ use super::*; mod ayuv64; +mod be_parity; mod endian; mod high_bit_4_2_0; mod 
high_bit_4_4_4_and_pn; diff --git a/src/row/arch/neon/tests/yuva.rs b/src/row/arch/neon/tests/yuva.rs index 1f871035..aa94ab9e 100644 --- a/src/row/arch/neon/tests/yuva.rs +++ b/src/row/arch/neon/tests/yuva.rs @@ -390,7 +390,15 @@ fn check_yuv444p_n_u16_neon_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_neon, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_neon, @@ -409,7 +417,14 @@ fn check_pn_444_u16_neon_rgba_equivalence( let uv = interleave_uv(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::( + &y, + &uv, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_neon, width, matrix, full_range); } @@ -425,7 +440,15 @@ fn check_yuv444p16_u16_neon_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_neon(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_neon = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_neon, width, matrix, full_range); } diff --git a/src/row/arch/neon/yuv_planar_16bit.rs b/src/row/arch/neon/yuv_planar_16bit.rs index 7a5f3774..7b7869c5 100644 --- a/src/row/arch/neon/yuv_planar_16bit.rs +++ b/src/row/arch/neon/yuv_planar_16bit.rs @@ -1100,7 +1100,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< // 16-bit alpha is full-range u16 — byte-swap if BE, then // load 8 lanes verbatim, no shift needed. 
endian::load_endian_u16x8::( - a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8, + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8 ) } else { alpha_u16 diff --git a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs index f2711d7d..27376bc3 100644 --- a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs @@ -2,6 +2,20 @@ use core::arch::wasm32::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x8(v: v128) -> v128 { + if BE { + let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + u8x16_swizzle(v, mask) + } else { + v + } +} + /// WASM simd128 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → /// packed **8‑bit** RGB. /// @@ -29,7 +43,7 @@ use super::*; /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -39,7 +53,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -49,7 +63,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgba_row( +pub(crate) unsafe fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -59,7 +73,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -74,7 +88,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -115,9 +129,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = u16x8_shr(v128_load(y.as_ptr().add(x).cast()), shr); - let y_high_i16 = u16x8_shr(v128_load(y.as_ptr().add(x + 8).cast()), shr); + // BE input is byte-swapped via `load_endian_u16x8::` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. 
+ let y_low_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr, + ); + let y_high_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr, + ); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_vec = u16x8_shr(u_vec, shr); let v_vec = u16x8_shr(v_vec, shr); @@ -174,9 +198,9 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -199,7 +223,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -208,7 +232,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -221,7 +245,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -230,7 +254,7 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -249,7 +273,11 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( /// 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -293,9 +321,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. + let y_low_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr, + ); + let y_high_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr, + ); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_vec = u16x8_shr(u_vec, shr); let v_vec = u16x8_shr(v_vec, shr); @@ -352,9 +390,13 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -370,7 +412,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -380,7 +422,7 @@ pub(crate) unsafe fn p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the 
shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -389,7 +431,7 @@ pub(crate) unsafe fn p16_to_rgb_row( /// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p16_to_rgba_row( +pub(crate) unsafe fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -399,7 +441,7 @@ pub(crate) unsafe fn p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -408,7 +450,7 @@ pub(crate) unsafe fn p16_to_rgba_row( /// `0xFF` alpha. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -442,9 +484,13 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_low = v128_load(y.as_ptr().add(x).cast()); - let y_high = v128_load(y.as_ptr().add(x + 8).cast()); + // BE input is byte-swapped via `load_endian_u16x8::` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_i16 = i16x8_sub(u_vec, bias16_v); let v_i16 = i16x8_sub(v_vec, bias16_v); @@ -498,9 +544,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -515,7 +561,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// 3. `y.len() >= width`, `uv_half.len() >= width`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p16_to_rgb_u16_row( +pub(crate) unsafe fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -524,7 +570,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -536,7 +582,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( /// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p16_to_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -545,7 +591,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -562,7 +608,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -600,8 +646,10 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( while x + 8 <= width { // 8 Y + 4 UV pairs (= 8 u16 = 16 bytes). Deinterleave via // `i8x16_shuffle`: [U0,V0,U1,V1,U2,V2,U3,V3] → - // [U0,U1,U2,U3, V0,V1,V2,V3]. - let y_vec = v128_load(y.as_ptr().add(x).cast()); + // [U0,U1,U2,U3, V0,V1,V2,V3]. BE input is byte-swapped via + // `load_endian_u16x8::` for Y and via `byteswap_u16x8::` + // after the deinterleave for UV. 
+ let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); let uv_raw = v128_load(uv_half.as_ptr().add(x).cast()); let uv_split = i8x16_shuffle::<0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15>(uv_raw, uv_raw); @@ -614,6 +662,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( uv_split, i16x8_splat(0), ); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_i16 = i16x8_sub(u_vec, bias16); let v_i16 = i16x8_sub(v_vec, bias16); @@ -679,9 +729,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } diff --git a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs index f74e9d03..8a735757 100644 --- a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs @@ -2,6 +2,20 @@ use core::arch::wasm32::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x8(v: v128) -> v128 { + if BE { + let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); + u8x16_swizzle(v, mask) + } else { + v + } +} + // ===== Pn 4:4:4 (semi-planar high-bit-packed) → RGB ======================= // // Native wasm simd128 4:4:4 Pn kernels — combine `yuv_444p_n_to_rgb_row`'s @@ -21,7 +35,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_to_rgb_row( +pub(crate) unsafe fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -31,7 +45,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -45,7 +59,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( /// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_to_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -55,7 +69,7 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -72,7 +86,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -109,12 +127,24 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. + let y_low_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr, + ); + let y_high_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr, + ); // 32 UV elements (= 16 pairs) — two deinterleave calls. 
let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_vec = u16x8_shr(u_lo_vec, shr); let v_lo_vec = u16x8_shr(v_lo_vec, shr); let u_hi_vec = u16x8_shr(u_hi_vec, shr); @@ -179,9 +209,13 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgba_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgb_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -197,7 +231,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -207,7 +241,9 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } @@ -220,7 +256,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// Same as [`p_n_444_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -230,7 +266,9 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -247,7 +285,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -287,11 +329,23 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. + let y_low_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr, + ); + let y_high_i16 = u16x8_shr( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr, + ); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_vec = u16x8_shr(u_lo_vec, shr); let v_lo_vec = u16x8_shr(v_lo_vec, shr); let u_hi_vec = u16x8_shr(u_hi_vec, shr); @@ -361,11 +415,11 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + scalar::p_n_444_to_rgba_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } else { - scalar::p_n_444_to_rgb_u16_row::( + scalar::p_n_444_to_rgb_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } @@ -385,7 +439,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgb_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -395,7 +449,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -409,7 +463,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -419,7 +473,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -434,7 +488,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -467,11 +521,17 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_low = v128_load(y.as_ptr().add(x).cast()); - let y_high = v128_load(y.as_ptr().add(x + 8).cast()); + // BE input is byte-swapped via `load_endian_u16x8::` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); @@ -531,9 +591,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -549,7 +609,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -559,7 +619,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -571,7 +631,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( /// Same as [`p_n_444_16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -581,7 +641,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -597,7 +657,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -633,8 +693,12 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { // 8 Y + 8 chroma pairs (= 16 UV elements) — one deinterleave call. - let y_vec = v128_load(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x8::` for Y, + // and via `byteswap_u16x8::` after deinterleave for UV. 
+ let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16_wasm(uv_full.as_ptr().add(x * 2)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_i16 = i16x8_sub(u_vec, bias16); let v_i16 = i16x8_sub(v_vec, bias16); @@ -714,9 +778,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } diff --git a/src/row/arch/wasm_simd128/tests/be_parity.rs b/src/row/arch/wasm_simd128/tests/be_parity.rs new file mode 100644 index 00000000..0218cc6a --- /dev/null +++ b/src/row/arch/wasm_simd128/tests/be_parity.rs @@ -0,0 +1,234 @@ +//! BE parity tests for WASM simd128 high-bit YUV / P-format kernels. +//! +//! Each test takes a randomized LE input buffer, byte-swaps every u16 +//! element to produce a BE-encoded buffer, then asserts that +//! `kernel::(swapped_input)` produces byte-identical output +//! to `kernel::(original_input)`. 
+ +use super::{ + super::*, high_bit_plane_wasm, interleave_uv_wasm, p_n_packed_plane, p010_uv_interleave, + p16_plane_wasm, planar_n_plane, +}; + +fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf.iter().map(|x| x.swap_bytes()).collect() +} + +#[test] +fn simd128_yuv_420p10_be_parity_u8() { + let width = 32; + let y = planar_n_plane::<10>(width, 13); + let u = planar_n_plane::<10>(width / 2, 17); + let v = planar_n_plane::<10>(width / 2, 19); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_yuv_420p10_be_parity_u16() { + let width = 32; + let y = planar_n_plane::<10>(width, 23); + let u = planar_n_plane::<10>(width / 2, 29); + let v = planar_n_plane::<10>(width / 2, 31); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + yuv_420p_n_to_rgb_u16_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_yuv_444p12_be_parity_u8() { + let width = 32; + let y = planar_n_plane::<12>(width, 41); + let u = planar_n_plane::<12>(width, 43); + let v = planar_n_plane::<12>(width, 47); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = 
std::vec![0u8; width * 3]; + unsafe { + yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_yuv_420p16_be_parity_u8() { + let width = 32; + let y = p16_plane_wasm(width, 53); + let u = p16_plane_wasm(width / 2, 59); + let v = p16_plane_wasm(width / 2, 61); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_yuv_444p16_be_parity_u16() { + let width = 32; + let y = p16_plane_wasm(width, 67); + let u = p16_plane_wasm(width, 71); + let v = p16_plane_wasm(width, 73); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p16_to_rgb_u16_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_p010_be_parity_u8() { + let width = 32; + let y = p_n_packed_plane::<10>(width, 79); + let u_half = p_n_packed_plane::<10>(width / 2, 83); + let v_half = p_n_packed_plane::<10>(width / 2, 89); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 
3]; + unsafe { + p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_p410_be_parity_u8() { + let width = 32; + let y = p_n_packed_plane::<10>(width, 97); + let u_full = high_bit_plane_wasm::<10>(width, 101); + let v_full = high_bit_plane_wasm::<10>(width, 103); + let uv_full = interleave_uv_wasm(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_p016_be_parity_u8() { + let width = 32; + let y = p16_plane_wasm(width, 107); + let u_half = p16_plane_wasm(width / 2, 109); + let v_half = p16_plane_wasm(width / 2, 113); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p16_to_rgb_row::(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn simd128_p416_be_parity_u16() { + let width = 16; + let y = p16_plane_wasm(width, 127); + let u_full = p16_plane_wasm(width, 131); + let v_full = p16_plane_wasm(width, 137); + let uv_full = interleave_uv_wasm(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + 
p_n_444_16_to_rgb_u16_row::(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} diff --git a/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs b/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs index ad74796c..8ef45f22 100644 --- a/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs +++ b/src/row/arch/wasm_simd128/tests/high_bit_4_2_0.rs @@ -105,9 +105,17 @@ fn check_p10_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_rang let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -130,9 +138,17 @@ fn check_p10_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_ran let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -213,9 +229,17 @@ fn check_p_n_u8_simd128_equivalence( let v = p_n_plane_simd128::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut 
rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -233,9 +257,17 @@ fn check_p_n_u16_simd128_equivalence( let v = p_n_plane_simd128::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -285,9 +317,9 @@ fn check_p010_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_ran let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u8 diverges"); } @@ -299,9 +331,9 @@ fn check_p010_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_ra let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut 
rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 P010→u16 diverges"); } @@ -364,9 +396,17 @@ fn check_planar_u8_simd128_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -384,9 +424,17 @@ fn check_planar_u16_simd128_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -405,9 +453,9 @@ fn check_pn_u8_simd128_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, 
matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u8 diverges"); } @@ -423,9 +471,9 @@ fn check_pn_u16_simd128_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "simd128 Pn {BITS}-bit → u16 diverges"); } @@ -492,9 +540,9 @@ fn check_yuv420p16_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, ful let v = p16_plane_wasm(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -509,9 +557,9 @@ fn check_p16_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, full_rang let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -531,9 +579,17 @@ fn check_yuv420p16_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, fu let 
v = p16_plane_wasm(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -548,9 +604,9 @@ fn check_p16_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, full_ran let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -564,9 +620,17 @@ fn check_yuv444p16_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, fu let v = p16_plane_wasm(width, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, diff --git a/src/row/arch/wasm_simd128/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/wasm_simd128/tests/high_bit_4_4_4_and_pn.rs index 44345d9b..a5a9e77e 100644 --- 
a/src/row/arch/wasm_simd128/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/wasm_simd128/tests/high_bit_4_4_4_and_pn.rs @@ -20,9 +20,17 @@ fn check_planar_u8_simd128_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -41,9 +49,9 @@ fn check_pn_u8_simd128_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -61,9 +69,9 @@ fn check_yuv420p16_u8_simd128_rgba_equivalence( let v = p16_plane_wasm(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -78,9 +86,9 @@ fn check_p16_u8_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, full let uv = 
p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -192,7 +200,7 @@ fn check_planar_u16_simd128_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( &y, &u, &v, @@ -202,7 +210,15 @@ fn check_planar_u16_simd128_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -221,9 +237,9 @@ fn check_pn_u16_simd128_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -241,9 +257,17 @@ fn check_yuv420p16_u16_simd128_rgba_equivalence( let v = p16_plane_wasm(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + 
scalar::yuv_420p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -258,9 +282,9 @@ fn check_p16_u16_simd128_rgba_equivalence(width: usize, matrix: ColorMatrix, ful let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -373,9 +397,9 @@ fn check_p_n_444_u8_simd128_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -394,9 +418,16 @@ fn check_p_n_444_u16_simd128_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::( + &y, + &uv, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + 
p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -411,9 +442,9 @@ fn check_p_n_444_16_u8_simd128_equivalence(width: usize, matrix: ColorMatrix, fu let uv = interleave_uv_wasm(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -428,9 +459,9 @@ fn check_p_n_444_16_u16_simd128_equivalence(width: usize, matrix: ColorMatrix, f let uv = interleave_uv_wasm(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -518,9 +549,17 @@ fn check_yuv444p_n_u8_simd128_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_wasm, @@ -539,9 +578,9 @@ fn 
check_pn_444_u8_simd128_rgba_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_row::(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_wasm, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_wasm, @@ -559,9 +598,9 @@ fn check_yuv444p16_u8_simd128_rgba_equivalence( let v = p16_plane_wasm(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); + yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_wasm, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_wasm, @@ -580,9 +619,9 @@ fn check_p_n_444_16_u8_simd128_rgba_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_wasm, width, matrix, full_range); + p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_wasm, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_wasm, @@ -675,7 +714,7 @@ fn check_yuv444p16_u8_simd128_rgba_with_alpha_src_equivalence( let a_src = p16_plane_wasm(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - 
scalar::yuv_444p16_to_rgba_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -686,7 +725,7 @@ fn check_yuv444p16_u8_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_with_alpha_src_row( + yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/wasm_simd128/tests/mod.rs b/src/row/arch/wasm_simd128/tests/mod.rs index 0ed1c869..9a24c874 100644 --- a/src/row/arch/wasm_simd128/tests/mod.rs +++ b/src/row/arch/wasm_simd128/tests/mod.rs @@ -5,6 +5,7 @@ use super::*; use crate::row::scalar::planar_gbr_f16 as scalar_f16; mod ayuv64; +mod be_parity; mod endian; mod high_bit_4_2_0; mod high_bit_4_4_4_and_pn; diff --git a/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs b/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs index 87f3feb6..3d30f972 100644 --- a/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs +++ b/src/row/arch/wasm_simd128/tests/planar_8bit_and_nv.rs @@ -516,11 +516,27 @@ fn check_yuv_444p_n_equivalence( let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_wasm = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + scalar::yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_wasm, width, matrix, full_range); - yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_wasm, width, matrix, full_range); + yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_wasm, width, matrix, full_range); + yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_wasm, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_wasm, @@ -589,9 +605,9 @@ fn check_yuv_444p16_equivalence(width: usize, matrix: ColorMatrix, 
full_range: b let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_wasm = std::vec![0u8; width * 3]; - scalar::yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_wasm, width, matrix, full_range); + yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_wasm, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_wasm, "simd128 yuv_444p16 u8 ≠ scalar"); // u16-output path delegates to scalar on wasm — no SIMD to compare diff --git a/src/row/arch/wasm_simd128/tests/yuva.rs b/src/row/arch/wasm_simd128/tests/yuva.rs index e9e25257..1df694fe 100644 --- a/src/row/arch/wasm_simd128/tests/yuva.rs +++ b/src/row/arch/wasm_simd128/tests/yuva.rs @@ -22,7 +22,7 @@ fn check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -33,7 +33,7 @@ fn check_yuv444p_n_u8_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_with_alpha_src_row::( + yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -206,7 +206,7 @@ fn check_yuv420p_n_u8_simd128_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -217,7 +217,7 @@ fn check_yuv420p_n_u8_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_with_alpha_src_row::( + yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -246,7 +246,7 @@ fn check_yuv420p16_u8_simd128_rgba_with_alpha_src_equivalence( let a_src = 
p16_plane_wasm(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_wasm = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -257,7 +257,7 @@ fn check_yuv420p16_u8_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_with_alpha_src_row( + yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -380,7 +380,7 @@ fn check_yuv444p_n_u16_simd128_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( &y, &u, &v, @@ -390,7 +390,15 @@ fn check_yuv444p_n_u16_simd128_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -409,9 +417,16 @@ fn check_pn_444_u16_simd128_rgba_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::( + &y, + &uv, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -429,9 +444,17 @@ fn check_yuv444p16_u16_simd128_rgba_equivalence( let v = p16_plane_wasm(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + 
scalar::yuv_444p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -450,9 +473,9 @@ fn check_p_n_444_16_u16_simd128_rgba_equivalence( let uv = interleave_uv_wasm(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -545,7 +568,7 @@ fn check_yuv444p16_u16_simd128_rgba_with_alpha_src_equivalence( let a_src = p16_plane_wasm(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -556,7 +579,7 @@ fn check_yuv444p16_u16_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_u16_with_alpha_src_row( + yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -639,7 +662,7 @@ fn check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -650,7 +673,7 @@ fn check_yuv444p_n_u16_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - 
yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -781,7 +804,7 @@ fn check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_wasm = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -792,7 +815,7 @@ fn check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -821,7 +844,7 @@ fn check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence( let a_src = p16_plane_wasm(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_wasm = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -832,7 +855,7 @@ fn check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_u16_with_alpha_src_row( + yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/wasm_simd128/yuv_planar_16bit.rs b/src/row/arch/wasm_simd128/yuv_planar_16bit.rs index 61f8c803..17553867 100644 --- a/src/row/arch/wasm_simd128/yuv_planar_16bit.rs +++ b/src/row/arch/wasm_simd128/yuv_planar_16bit.rs @@ -14,7 +14,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p16_to_rgb_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -25,7 +25,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -41,7 +41,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( /// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p16_to_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -52,7 +52,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -72,7 +72,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -84,7 +84,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -114,7 +114,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -155,12 +159,13 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row` first. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_lo_vec = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let u_hi_vec = endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8); + let v_lo_vec = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); + let v_hi_vec = endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8); let u_lo_i16 = i16x8_sub(u_lo_vec, bias16_v); let u_hi_i16 = i16x8_sub(u_hi_vec, bias16_v); @@ -212,8 +217,14 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit u8. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = u16x8_shr(v128_load(a_ptr.add(x).cast()), 8); - let a_hi = u16x8_shr(v128_load(a_ptr.add(x + 8).cast()), 8); + let a_lo = u16x8_shr( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + 8, + ); + let a_hi = u16x8_shr( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + 8, + ); u8x16_narrow_i16x8(a_lo, a_hi) } else { alpha_u8 @@ -234,15 +245,17 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_row( + scalar::yuv_444p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -258,7 +271,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -269,7 +282,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -283,7 +296,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// Same as [`yuv_444p16_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -294,7 +307,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -314,7 +327,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -326,7 +339,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -358,7 +371,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -401,10 +418,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row` first. 
+ let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let u_vec = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let v_vec = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); let u_i16 = i16x8_sub(u_vec, bias16); let v_i16 = i16x8_sub(v_vec, bias16); @@ -480,7 +498,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8 + ) } else { alpha_u16 }; @@ -500,15 +520,15 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_u16_row( + scalar::yuv_444p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_u16_row( + scalar::yuv_444p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -518,7 +538,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -529,7 +549,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -540,7 +560,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( /// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p16_to_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -551,7 +571,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -572,7 +592,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -584,7 +604,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -607,7 +627,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -649,10 +673,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row` first. + let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_vec = endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8); let u_i16 = i16x8_sub(u_vec, bias16_v); let v_i16 = i16x8_sub(v_vec, bias16_v); @@ -698,8 +723,14 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row> 8` to fit u8 directly. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = u16x8_shr(v128_load(a_ptr.add(x).cast()), 8); - let a_hi = u16x8_shr(v128_load(a_ptr.add(x + 8).cast()), 8); + let a_lo = u16x8_shr( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + 8, + ); + let a_hi = u16x8_shr( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + 8, + ); u8x16_narrow_i16x8(a_lo, a_hi) } else { alpha_u8 @@ -720,15 +751,17 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_row( + scalar::yuv_420p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -745,7 +778,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -755,7 +788,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -769,7 +802,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -779,7 +812,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -799,7 +832,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -811,7 +844,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -843,7 +876,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -887,10 +924,19 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row` for Y, + // and via inline `u8x16_swizzle` for the half-width U/V loads. + let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); // 4 U + 4 V samples = 8 bytes each. 
Use `v128_load64_zero` so we // don't over-read 8 bytes past the chroma plane — the public // contract only promises `u_half.len() >= width / 2`, and at @@ -898,6 +944,16 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8 + ) } else { alpha_u16 }; @@ -979,15 +1037,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_u16_row( + scalar::yuv_420p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_u16_row( + scalar::yuv_420p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/wasm_simd128/yuv_planar_high_bit.rs b/src/row/arch/wasm_simd128/yuv_planar_high_bit.rs index 2472b410..3e047ec0 100644 --- a/src/row/arch/wasm_simd128/yuv_planar_high_bit.rs +++ b/src/row/arch/wasm_simd128/yuv_planar_high_bit.rs @@ -29,7 +29,7 @@ use super::*; /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -40,7 +40,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -51,7 +51,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -62,7 +62,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -82,7 +82,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -94,7 +94,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -130,6 +130,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -178,11 +179,24 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 16 <= width { // AND‑mask each load to the low 10 bits — see matching comment - // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. - let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); - let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); - let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. BE input is + // byte-swapped via `load_endian_u16x8::` first. 
+ let y_low_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = v128_and( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = v128_and( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -230,8 +244,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // alpha (e.g. 1024 at BITS=10), matching scalar. `u16x8_shr` // accepts a runtime u32 count, so `BITS - 8` works directly. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = v128_and(v128_load(a_ptr.add(x).cast()), mask_v); - let a_hi = v128_and(v128_load(a_ptr.add(x + 8).cast()), mask_v); + let a_lo = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); let a_lo_shifted = u16x8_shr(a_lo, BITS - 8); let a_hi_shifted = u16x8_shr(a_hi, BITS - 8); u8x16_narrow_i16x8(a_lo_shifted, a_hi_shifted) @@ -255,15 +275,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_row::( + scalar::yuv_420p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_row::( + scalar::yuv_420p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -289,7 +309,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -299,7 +319,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -314,7 +334,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( /// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -324,7 +344,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -344,7 +364,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -356,7 +376,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -393,6 +413,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -444,11 +465,24 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 16 <= width { // AND‑mask loads to the low 10 bits so `chroma_i16x8`'s - // `i16x8_narrow_i32x4` stays lossless. - let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); - let u_vec = v128_and(v128_load(u_half.as_ptr().add(x / 2).cast()), mask_v); - let v_vec = v128_and(v128_load(v_half.as_ptr().add(x / 2).cast()), mask_v); + // `i16x8_narrow_i32x4` stays lossless. BE input is byte-swapped + // via `load_endian_u16x8::` first. 
+ let y_low_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = v128_and( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = v128_and( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let u_i16 = i16x8_sub(u_vec, bias_v); let v_i16 = i16x8_sub(v_vec, bias_v); @@ -491,8 +525,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // Mask alpha loads to BITS — same hardening as Y/U/V. Native // bit depth output, so no shift. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = v128_and(v128_load(a_ptr.add(x).cast()), mask_v); - let hi = v128_and(v128_load(a_ptr.add(x + 8).cast()), mask_v); + let lo = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -518,15 +558,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_u16_row::( + scalar::yuv_420p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -546,7 +586,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -557,7 +597,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -575,7 +615,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( /// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -586,7 +626,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -607,7 +647,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -619,7 +659,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -655,6 +695,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -701,13 +742,32 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 16 <= width { // 16 Y + 16 U + 16 V per iter. Full-width chroma (two u16x8 - // loads each) — no horizontal duplication, 4:4:4 is 1:1. 
- let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); - let u_lo_vec = v128_and(v128_load(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = v128_and(v128_load(u.as_ptr().add(x + 8).cast()), mask_v); - let v_lo_vec = v128_and(v128_load(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = v128_and(v128_load(v.as_ptr().add(x + 8).cast()), mask_v); + // loads each) — no horizontal duplication, 4:4:4 is 1:1. BE + // input is byte-swapped via `load_endian_u16x8::` first. + let y_low_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_vec = v128_and( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = v128_and( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_vec = v128_and( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = v128_and( + endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let u_lo_i16 = i16x8_sub(u_lo_vec, bias_v); let u_hi_i16 = i16x8_sub(u_hi_vec, bias_v); @@ -758,8 +818,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = v128_and(v128_load(a_ptr.add(x).cast()), mask_v); - let a_hi = v128_and(v128_load(a_ptr.add(x + 8).cast()), mask_v); + let a_lo = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 1024 at BITS=10), matching scalar. 
// `u16x8_shr` accepts a runtime u32 count; `BITS - 8` is a @@ -790,15 +856,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_row::( + scalar::yuv_444p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_row::( + scalar::yuv_444p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -817,7 +883,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// Same as [`yuv_444p_n_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -828,7 +894,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -846,7 +912,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// Same as [`yuv_444p_n_to_rgb_u16_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -857,7 +923,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -879,7 +945,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "simd128")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -891,7 +957,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -930,6 +996,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -978,12 +1045,31 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = v128_and(v128_load(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = v128_and(v128_load(y.as_ptr().add(x + 8).cast()), mask_v); - let u_lo_vec = v128_and(v128_load(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = v128_and(v128_load(u.as_ptr().add(x + 8).cast()), mask_v); - let v_lo_vec = v128_and(v128_load(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = v128_and(v128_load(v.as_ptr().add(x + 8).cast()), mask_v); + // BE input is byte-swapped via `load_endian_u16x8::` first. 
+ let y_low_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = v128_and( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_vec = v128_and( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = v128_and( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_vec = v128_and( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = v128_and( + endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let u_lo_i16 = i16x8_sub(u_lo_vec, bias_v); let u_hi_i16 = i16x8_sub(u_hi_vec, bias_v); @@ -1033,8 +1119,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< // at the same native bit depth (BITS), so just AND-mask any // over-range bits to match the scalar reference. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = v128_and(v128_load(a_ptr.add(x).cast()), mask_v); - let hi = v128_and(v128_load(a_ptr.add(x + 8).cast()), mask_v); + let lo = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = v128_and( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -1060,15 +1152,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_u16_row::( + scalar::yuv_444p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs index 3385e8b9..677af1bc 100644 --- a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs @@ -2,6 +2,25 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_avx2` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x16(v: __m256i) -> __m256i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 32], __m256i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, + 13, 12, 15, 14, + ]) + }; + unsafe { _mm256_shuffle_epi8(v, mask) } + } else { + v + } +} + /// AVX2 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed /// **8‑bit** RGB. /// @@ -30,7 +49,7 @@ use super::*; /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -40,7 +59,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -50,7 +69,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgba_row( +pub(crate) unsafe fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -60,7 +79,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -76,7 +95,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( /// 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -115,13 +134,22 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let mut x = 0usize; while x + 32 <= width { - // 32 Y = two u16×16 loads, shifted right by `16 - BITS`. - let y_low_i16 = _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x).cast()), shr_count); - let y_high_i16 = - _mm256_srl_epi16(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), shr_count); - - // 32 UV (16 pairs) — deinterleave + shift. + // 32 Y = two u16×16 loads, shifted right by `16 - BITS`. BE + // input is byte-swapped via `load_endian_u16x16::` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_low_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + shr_count, + ); + + // 32 UV (16 pairs) — deinterleave + byte-swap (for BE) + shift. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x16::(u_vec); + let v_vec = byteswap_u16x16::(v_vec); let u_vec = _mm256_srl_epi16(u_vec, shr_count); let v_vec = _mm256_srl_epi16(v_vec, shr_count); @@ -187,9 +215,9 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -212,7 +240,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -221,7 +249,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -234,7 +262,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -243,7 +271,7 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -262,7 +290,11 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( /// 4. `BITS` ∈ `{10, 12}`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -304,10 +336,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_low_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + shr_count, + ); let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x16::(u_vec); + let v_vec = byteswap_u16x16::(v_vec); let u_vec = _mm256_srl_epi16(u_vec, shr_count); let v_vec = _mm256_srl_epi16(v_vec, shr_count); @@ -421,9 +462,13 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -439,7 +484,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -449,7 +494,7 @@ pub(crate) unsafe fn p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -458,7 +503,7 @@ pub(crate) unsafe fn p16_to_rgb_row( /// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p16_to_rgba_row( +pub(crate) unsafe fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -468,7 +513,7 @@ pub(crate) unsafe fn p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -476,7 +521,7 @@ pub(crate) unsafe fn p16_to_rgba_row( /// `ALPHA = true` uses `write_rgba_32` with constant `0xFF` alpha. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -510,11 +555,15 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 32 <= width { - let y_low = _mm256_loadu_si256(y.as_ptr().add(x).cast()); - let y_high = _mm256_loadu_si256(y.as_ptr().add(x + 16).cast()); + // BE input is byte-swapped via `load_endian_u16x16::` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_low = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8); // Deinterleave 32 UV pairs (64 u16) from uv_half[x..x+32]. // Uses the shared AVX2 deinterleave helper for Pn formats. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x16::(u_vec); + let v_vec = byteswap_u16x16::(v_vec); let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); @@ -577,9 +626,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -593,7 +642,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// Same as [`p16_to_rgb_row`] but `rgb_out` is `&mut [u16]`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p16_to_rgb_u16_row( +pub(crate) unsafe fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -602,7 +651,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -614,7 +663,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( /// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p16_to_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -623,7 +672,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -639,7 +688,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -675,11 +724,17 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( // Load as two __m128i halves so we can reuse the SSE4.1 128-bit // byte-shuffle mask. Each half carries 4 UV pairs; we deinterleave // each to [U's | V's] and then join the two U halves / two V halves. - let split_mask_128 = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + // For BE: fold per-u16-lane byte-swap into the deinterleave mask. + let split_mask_128 = if BE { + _mm_setr_epi8(1, 0, 5, 4, 9, 8, 13, 12, 3, 2, 7, 6, 11, 10, 15, 14) + } else { + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15) + }; let mut x = 0usize; while x + 16 <= width { - let y_vec = _mm256_loadu_si256(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x16::` for Y. + let y_vec = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); // Two 128-bit UV loads: bytes [0..16) and [16..32). `x + 8` is // in u16 units (8 u16 = 16 bytes) — the second load starts at // byte offset 16, which is UV pair index 4. @@ -687,6 +742,8 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let uv_hi_raw = _mm_loadu_si128(uv_half.as_ptr().add(x + 8).cast()); // Deinterleave each half: [U0,V0,U1,V1,U2,V2,U3,V3] → // [U0,U1,U2,U3, V0,V1,V2,V3] (low 64b = U's, high 64b = V's). + // The shuffle mask above also swaps bytes within each u16 lane + // when `BE = true`. let uv_lo_split = _mm_shuffle_epi8(uv_lo_raw, split_mask_128); let uv_hi_split = _mm_shuffle_epi8(uv_hi_raw, split_mask_128); // Combine: low 64 of each → 8 U samples; high 64 of each → 8 V. 
@@ -790,9 +847,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } diff --git a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_4_4.rs index a11cb2da..ba350083 100644 --- a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_4_4.rs @@ -2,6 +2,25 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_avx2` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x16(v: __m256i) -> __m256i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 32], __m256i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, + 13, 12, 15, 14, + ]) + }; + unsafe { _mm256_shuffle_epi8(v, mask) } + } else { + v + } +} + // ===== Pn 4:4:4 (semi-planar high-bit-packed) → RGB ======================= // // Native AVX2 4:4:4 Pn kernels — combine `yuv_444p_n_to_rgb_row`'s @@ -21,7 +40,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgb_row( +pub(crate) unsafe fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -31,7 +50,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -45,7 +64,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( /// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -55,7 +74,7 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -72,7 +91,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -108,13 +131,24 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_low_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + shr_count, + ); // 64 UV elements (= 32 pairs) — two deinterleave calls. 
let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2 + 32)); + let u_lo_vec = byteswap_u16x16::(u_lo_vec); + let v_lo_vec = byteswap_u16x16::(v_lo_vec); + let u_hi_vec = byteswap_u16x16::(u_hi_vec); + let v_hi_vec = byteswap_u16x16::(v_hi_vec); let u_lo_vec = _mm256_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm256_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm256_srl_epi16(u_hi_vec, shr_count); @@ -203,9 +237,13 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgba_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgb_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -223,7 +261,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -233,7 +271,9 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } @@ -247,7 +287,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -257,7 +297,9 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -274,7 +316,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( /// 3. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -313,12 +359,23 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_low_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm256_srl_epi16( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + shr_count, + ); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2 + 32)); + let u_lo_vec = byteswap_u16x16::(u_lo_vec); + let v_lo_vec = byteswap_u16x16::(v_lo_vec); + let u_hi_vec = byteswap_u16x16::(u_hi_vec); + let v_hi_vec = byteswap_u16x16::(v_hi_vec); let u_lo_vec = _mm256_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm256_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm256_srl_epi16(u_hi_vec, shr_count); @@ -455,11 +512,11 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + scalar::p_n_444_to_rgba_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } else { - scalar::p_n_444_to_rgb_u16_row::( + scalar::p_n_444_to_rgb_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } @@ -479,7 +536,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row= 3 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgb_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -489,7 +546,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -503,7 +560,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -513,7 +570,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -528,7 +585,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -561,11 +618,17 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 32 <= width { - let y_low = _mm256_loadu_si256(y.as_ptr().add(x).cast()); - let y_high = _mm256_loadu_si256(y.as_ptr().add(x + 16).cast()); + // BE input is byte-swapped via `load_endian_u16x16::` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2 + 32)); + let u_lo_vec = byteswap_u16x16::(u_lo_vec); + let v_lo_vec = byteswap_u16x16::(v_lo_vec); + let u_hi_vec = byteswap_u16x16::(u_hi_vec); + let v_hi_vec = byteswap_u16x16::(v_hi_vec); let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); @@ -649,9 +712,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -668,7 +731,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -678,7 +741,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -692,7 +755,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( /// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -702,7 +765,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -717,7 +780,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -751,9 +814,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let y_vec = _mm256_loadu_si256(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x16::` for Y, + // and via `byteswap_u16x16::` after deinterleave for UV. + let y_vec = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); // 32 UV elements (= 16 pairs) — one deinterleave call. 
let (u_vec, v_vec) = deinterleave_uv_u16_avx2(uv_full.as_ptr().add(x * 2)); + let u_vec = byteswap_u16x16::(u_vec); + let v_vec = byteswap_u16x16::(v_vec); let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); @@ -869,9 +936,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } diff --git a/src/row/arch/x86_avx2/tests/be_parity.rs b/src/row/arch/x86_avx2/tests/be_parity.rs new file mode 100644 index 00000000..56b41e34 --- /dev/null +++ b/src/row/arch/x86_avx2/tests/be_parity.rs @@ -0,0 +1,261 @@ +//! BE parity tests for AVX2 high-bit YUV / P-format kernels. +//! +//! Each test takes a randomized LE input buffer, byte-swaps every u16 +//! element to produce a BE-encoded buffer, then asserts that +//! `kernel::(swapped_input)` produces byte-identical output +//! to `kernel::(original_input)`. 
+ +use super::{ + super::*, high_bit_plane_avx2, interleave_uv_avx2, p_n_packed_plane, p010_uv_interleave, + p16_plane_avx2, planar_n_plane, +}; + +fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec<u16> { + buf.iter().map(|x| x.swap_bytes()).collect() +} + +#[test] +fn avx2_yuv_420p10_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = planar_n_plane::<10>(width, 13); + let u = planar_n_plane::<10>(width / 2, 17); + let v = planar_n_plane::<10>(width / 2, 19); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_yuv_420p10_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = planar_n_plane::<10>(width, 23); + let u = planar_n_plane::<10>(width / 2, 29); + let v = planar_n_plane::<10>(width / 2, 31); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + yuv_420p_n_to_rgb_u16_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_yuv_444p12_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = planar_n_plane::<12>(width, 41); + let u = planar_n_plane::<12>(width, 43); + let v =
planar_n_plane::<12>(width, 47); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_yuv_420p16_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = p16_plane_avx2(width, 53); + let u = p16_plane_avx2(width / 2, 59); + let v = p16_plane_avx2(width / 2, 61); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p16_to_rgb_row::<false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::<true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_yuv_444p16_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = p16_plane_avx2(width, 67); + let u = p16_plane_avx2(width, 71); + let v = p16_plane_avx2(width, 73); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_444p16_to_rgb_u16_row::<false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p16_to_rgb_u16_row::<true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_p010_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; +
} + let width = 64; + let y = p_n_packed_plane::<10>(width, 79); + let u_half = p_n_packed_plane::<10>(width / 2, 83); + let v_half = p_n_packed_plane::<10>(width / 2, 89); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_p410_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = p_n_packed_plane::<10>(width, 97); + let u_full = high_bit_plane_avx2::<10>(width, 101); + let v_full = high_bit_plane_avx2::<10>(width, 103); + let uv_full = interleave_uv_avx2(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_p016_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 64; + let y = p16_plane_avx2(width, 107); + let u_half = p16_plane_avx2(width / 2, 109); + let v_half = p16_plane_avx2(width / 2, 113); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p16_to_rgb_row::<false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::<true>(&y_be, &uv_be, &mut out_be, width,
ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx2_p416_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + let width = 32; + let y = p16_plane_avx2(width, 127); + let u_full = p16_plane_avx2(width, 131); + let v_full = p16_plane_avx2(width, 137); + let uv_full = interleave_uv_avx2(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_444_16_to_rgb_u16_row::(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} diff --git a/src/row/arch/x86_avx2/tests/high_bit_4_2_0.rs b/src/row/arch/x86_avx2/tests/high_bit_4_2_0.rs index 85dc03cd..51c2b03f 100644 --- a/src/row/arch/x86_avx2/tests/high_bit_4_2_0.rs +++ b/src/row/arch/x86_avx2/tests/high_bit_4_2_0.rs @@ -104,9 +104,17 @@ fn check_p10_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -132,9 +140,17 @@ fn check_p10_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + 
&u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -218,9 +234,17 @@ fn check_p_n_u8_avx2_equivalence( let v = p_n_plane_avx2::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -241,9 +265,17 @@ fn check_p_n_u16_avx2_equivalence( let v = p_n_plane_avx2::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -298,9 +330,9 @@ fn check_p010_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); 
+ p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u8 diverges"); } @@ -315,9 +347,9 @@ fn check_p010_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 P010→u16 diverges"); } @@ -383,9 +415,17 @@ fn check_planar_u8_avx2_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 planar {BITS}-bit → u8 diverges"); } @@ -403,9 +443,17 @@ fn check_planar_u16_avx2_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut 
rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -427,9 +475,9 @@ fn check_pn_u8_avx2_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u8 diverges"); } @@ -448,9 +496,9 @@ fn check_pn_u16_avx2_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX2 Pn {BITS}-bit → u16 diverges"); } @@ -520,9 +568,9 @@ fn check_yuv420p16_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_r let v = p16_plane_avx2(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -539,9 +587,17 @@ fn check_yuv420p16_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_ let v = p16_plane_avx2(width / 2, 71); 
let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -559,9 +615,9 @@ fn check_p16_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -579,9 +635,9 @@ fn check_p16_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full_range: let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, diff --git a/src/row/arch/x86_avx2/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/x86_avx2/tests/high_bit_4_4_4_and_pn.rs index 20433f6a..3bb1374d 100644 --- a/src/row/arch/x86_avx2/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/x86_avx2/tests/high_bit_4_4_4_and_pn.rs @@ -20,9 +20,17 @@ fn 
check_planar_u8_avx2_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -41,9 +49,9 @@ fn check_pn_u8_avx2_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -57,9 +65,9 @@ fn check_yuv420p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, f let v = p16_plane_avx2(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -74,9 +82,9 @@ fn check_p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ra let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; 
width * 4]; - scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -206,7 +214,7 @@ fn check_planar_u16_avx2_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( &y, &u, &v, @@ -216,7 +224,15 @@ fn check_planar_u16_avx2_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -235,9 +251,9 @@ fn check_pn_u16_avx2_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -251,9 +267,17 @@ fn check_yuv420p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_avx2(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, 
+ ); unsafe { - yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -268,9 +292,9 @@ fn check_p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, full_r let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -404,9 +428,9 @@ fn check_p_n_444_u8_avx2_equivalence( let uv = interleave_uv_avx2(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -428,9 +452,16 @@ fn check_p_n_444_u16_avx2_equivalence( let uv = interleave_uv_avx2(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::( + &y, + &uv, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -448,9 +479,9 @@ fn 
check_p_n_444_16_u8_avx2_equivalence(width: usize, matrix: ColorMatrix, full_ let uv = interleave_uv_avx2(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -468,9 +499,9 @@ fn check_p_n_444_16_u16_avx2_equivalence(width: usize, matrix: ColorMatrix, full let uv = interleave_uv_avx2(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -558,9 +589,17 @@ fn check_yuv444p_n_u8_avx2_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -579,9 +618,9 @@ fn check_pn_444_u8_avx2_rgba_equivalence( let uv = interleave_uv_avx2(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = 
std::vec![0u8; width * 4]; - scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -595,9 +634,9 @@ fn check_yuv444p16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, f let v = p16_plane_avx2(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -612,9 +651,9 @@ fn check_p_n_444_16_u8_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, let uv = interleave_uv_avx2(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -722,7 +761,7 @@ fn check_yuv444p16_u8_avx2_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx2(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -733,7 +772,7 @@ 
fn check_yuv444p16_u8_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_with_alpha_src_row( + yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_avx2/tests/mod.rs b/src/row/arch/x86_avx2/tests/mod.rs index fcb6df4b..94ed7cc0 100644 --- a/src/row/arch/x86_avx2/tests/mod.rs +++ b/src/row/arch/x86_avx2/tests/mod.rs @@ -1,4 +1,5 @@ mod ayuv64; +mod be_parity; mod endian; mod high_bit_4_2_0; mod high_bit_4_4_4_and_pn; diff --git a/src/row/arch/x86_avx2/tests/planar_8bit_and_nv.rs b/src/row/arch/x86_avx2/tests/planar_8bit_and_nv.rs index c41ca743..252299fc 100644 --- a/src/row/arch/x86_avx2/tests/planar_8bit_and_nv.rs +++ b/src/row/arch/x86_avx2/tests/planar_8bit_and_nv.rs @@ -665,11 +665,27 @@ fn check_yuv_444p_n_equivalence( let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_avx2 = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + scalar::yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_avx2, width, matrix, full_range); - yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_avx2, width, matrix, full_range); + yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_avx2, width, matrix, full_range); + yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_avx2, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_avx2, "AVX2 yuv_444p_n<{BITS}> u8 ≠ scalar"); assert_eq!(u16_scalar, u16_avx2, "AVX2 yuv_444p_n<{BITS}> u16 ≠ scalar"); @@ -749,11 +765,19 @@ fn check_yuv_444p16_equivalence(width: usize, matrix: ColorMatrix, full_range: b let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_avx2 = std::vec![0u16; width * 3]; - 
scalar::yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_avx2, width, matrix, full_range); - yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_avx2, width, matrix, full_range); + yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_avx2, width, matrix, full_range); + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut u16_avx2, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_avx2, "AVX2 yuv_444p16 u8 ≠ scalar"); assert_eq!(u16_scalar, u16_avx2, "AVX2 yuv_444p16 u16 ≠ scalar"); diff --git a/src/row/arch/x86_avx2/tests/yuva.rs b/src/row/arch/x86_avx2/tests/yuva.rs index 58536c36..e3026360 100644 --- a/src/row/arch/x86_avx2/tests/yuva.rs +++ b/src/row/arch/x86_avx2/tests/yuva.rs @@ -21,7 +21,7 @@ fn check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -32,7 +32,7 @@ fn check_yuv444p_n_u8_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_with_alpha_src_row::( + yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -200,7 +200,7 @@ fn check_yuv420p_n_u8_avx2_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -211,7 +211,7 @@ fn 
check_yuv420p_n_u8_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_with_alpha_src_row::( + yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -240,7 +240,7 @@ fn check_yuv420p16_u8_avx2_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx2(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -251,7 +251,7 @@ fn check_yuv420p16_u8_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_with_alpha_src_row( + yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -377,7 +377,7 @@ fn check_yuv444p_n_u16_avx2_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( &y, &u, &v, @@ -387,7 +387,15 @@ fn check_yuv444p_n_u16_avx2_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -406,9 +414,16 @@ fn check_pn_444_u16_avx2_rgba_equivalence( let uv = interleave_uv_avx2(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::( + &y, + &uv, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -422,9 +437,17 @@ fn 
check_yuv444p16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_avx2(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -439,9 +462,9 @@ fn check_p_n_444_16_u16_avx2_rgba_equivalence(width: usize, matrix: ColorMatrix, let uv = interleave_uv_avx2(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -549,7 +572,7 @@ fn check_yuv444p16_u16_avx2_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx2(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -560,7 +583,7 @@ fn check_yuv444p16_u16_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_u16_with_alpha_src_row( + yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -653,7 +676,7 @@ fn check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; 
width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -664,7 +687,7 @@ fn check_yuv444p_n_u16_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -797,7 +820,7 @@ fn check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -808,7 +831,7 @@ fn check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -837,7 +860,7 @@ fn check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx2(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -848,7 +871,7 @@ fn check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_u16_with_alpha_src_row( + yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_avx2/yuv_planar_16bit.rs b/src/row/arch/x86_avx2/yuv_planar_16bit.rs index 720c7dc7..22dbdea4 100644 --- a/src/row/arch/x86_avx2/yuv_planar_16bit.rs +++ b/src/row/arch/x86_avx2/yuv_planar_16bit.rs @@ -15,7 +15,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p16_to_rgb_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -26,7 +26,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -43,7 +43,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( /// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p16_to_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -54,7 +54,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -75,7 +75,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -87,7 +87,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -118,7 +118,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -159,12 +163,13 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row` first. 
+ let y_low = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8); + let u_lo_vec = endian::load_endian_u16x16::(u.as_ptr().add(x) as *const u8); + let u_hi_vec = endian::load_endian_u16x16::(u.as_ptr().add(x + 16) as *const u8); + let v_lo_vec = endian::load_endian_u16x16::(v.as_ptr().add(x) as *const u8); + let v_hi_vec = endian::load_endian_u16x16::(v.as_ptr().add(x + 16) as *const u8); let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias16_v); @@ -240,8 +245,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit u8. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm256_srli_epi16::<8>(_mm256_loadu_si256(a_ptr.add(x).cast())); - let a_hi = _mm256_srli_epi16::<8>(_mm256_loadu_si256(a_ptr.add(x + 16).cast())); + let a_lo = + _mm256_srli_epi16::<8>(endian::load_endian_u16x16::(a_ptr.add(x) as *const u8)); + let a_hi = _mm256_srli_epi16::<8>(endian::load_endian_u16x16::( + a_ptr.add(x + 16) as *const u8 + )); narrow_u8x32(a_lo, a_hi) } else { alpha_u8 @@ -262,15 +270,17 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_row( + scalar::yuv_444p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -289,7 +299,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -300,7 +310,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -316,7 +326,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -327,7 +337,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -347,7 +357,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -359,7 +369,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -390,7 +400,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -431,10 +445,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row` first. 
+ let y_vec = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); + let u_vec = endian::load_endian_u16x16::(u.as_ptr().add(x) as *const u8); + let v_vec = endian::load_endian_u16x16::(v.as_ptr().add(x) as *const u8); let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); @@ -519,7 +534,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8, + ); ( _mm256_castsi256_si128(a_vec), _mm256_extracti128_si256::<1>(a_vec), @@ -570,15 +587,15 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_u16_row( + scalar::yuv_444p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_u16_row( + scalar::yuv_444p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -598,7 +615,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -609,7 +626,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -620,7 +637,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( /// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p16_to_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -631,7 +648,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -651,7 +668,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -663,7 +680,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -686,7 +703,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -728,10 +749,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row` first. + let y_low = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8); + let u_vec = endian::load_endian_u16x16::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x16::(v_half.as_ptr().add(x / 2) as *const u8); let u_i16 = _mm256_sub_epi16(u_vec, bias16_v); let v_i16 = _mm256_sub_epi16(v_vec, bias16_v); @@ -787,8 +809,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row> 8` to fit u8. // `_mm256_srli_epi16::<8>` accepts a const literal shift. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm256_srli_epi16::<8>(_mm256_loadu_si256(a_ptr.add(x).cast())); - let a_hi = _mm256_srli_epi16::<8>(_mm256_loadu_si256(a_ptr.add(x + 16).cast())); + let a_lo = + _mm256_srli_epi16::<8>(endian::load_endian_u16x16::(a_ptr.add(x) as *const u8)); + let a_hi = _mm256_srli_epi16::<8>(endian::load_endian_u16x16::( + a_ptr.add(x + 16) as *const u8 + )); narrow_u8x32(a_lo, a_hi) } else { alpha_u8 @@ -809,15 +834,17 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_row( + scalar::yuv_420p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -834,7 +861,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -844,7 +871,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -858,7 +885,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -868,7 +895,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -887,7 +914,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -899,7 +926,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -931,7 +958,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -971,12 +1002,29 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row` for Y and via + // inline `_mm_shuffle_epi8` for the half-width U/V loads. 
+ let y_vec = endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8); let u_vec_128 = _mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()); let v_vec_128 = _mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()); + let u_vec_128 = if BE { + _mm_shuffle_epi8(u_vec_128, bswap_u16) + } else { + u_vec_128 + }; + let v_vec_128 = if BE { + _mm_shuffle_epi8(v_vec_128, bswap_u16) + } else { + v_vec_128 + }; // Center UV via wrapping `-(-32768)` trick. let bias16_128 = _mm256_castsi256_si128(bias16_v); @@ -1052,7 +1100,9 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8, + ); ( _mm256_castsi256_si128(a_vec), _mm256_extracti128_si256::<1>(a_vec), @@ -1103,15 +1153,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_u16_row( + scalar::yuv_420p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_u16_row( + scalar::yuv_420p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_avx2/yuv_planar_high_bit.rs b/src/row/arch/x86_avx2/yuv_planar_high_bit.rs index 8e5deb3e..001a22ee 100644 --- a/src/row/arch/x86_avx2/yuv_planar_high_bit.rs +++ b/src/row/arch/x86_avx2/yuv_planar_high_bit.rs @@ -31,7 +31,7 @@ use super::*; /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -42,7 +42,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -53,7 +53,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -64,7 +64,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -84,7 +84,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -96,7 +96,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -131,6 +131,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -180,14 +181,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // 32 Y = two `_mm256_loadu_si256` (16 u16 each). U/V each = one // load of 16 u16. AND‑mask each load to the low 10 bits — see // matching comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. - let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + // BE input is byte-swapped via `load_endian_u16x16::` first. 
+ let y_low_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + mask_v, + ); let u_vec = _mm256_and_si256( - _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x16::(u_half.as_ptr().add(x / 2) as *const u8), mask_v, ); let v_vec = _mm256_and_si256( - _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x16::(v_half.as_ptr().add(x / 2) as *const u8), mask_v, ); @@ -248,8 +256,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // generic shift (not stable for `BITS - 8`); use // `_mm256_srl_epi16` with a count vector built from `BITS-8`. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x + 16).cast()), mask_v); + let a_lo = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x + 16) as *const u8), + mask_v, + ); let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32); let a_lo_shifted = _mm256_srl_epi16(a_lo, a_shr); let a_hi_shifted = _mm256_srl_epi16(a_hi, a_shr); @@ -277,15 +291,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_row::( + scalar::yuv_420p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_row::( + scalar::yuv_420p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -317,7 +331,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -327,7 +341,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -342,7 +356,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( /// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -352,7 +366,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -372,7 +386,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -384,7 +398,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -421,6 +435,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -471,15 +486,22 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 32 <= width { // AND‑mask loads to the low 10 bits so `chroma_i16x16`'s - // `_mm256_packs_epi32` narrow stays lossless. - let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); + // `_mm256_packs_epi32` narrow stays lossless. BE input is + // byte-swapped via `load_endian_u16x16::` first. 
+ let y_low_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + mask_v, + ); let u_vec = _mm256_and_si256( - _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x16::(u_half.as_ptr().add(x / 2) as *const u8), mask_v, ); let v_vec = _mm256_and_si256( - _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x16::(v_half.as_ptr().add(x / 2) as *const u8), mask_v, ); @@ -538,8 +560,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // bit depth output, so no shift; just split each 256-bit // load into two 128-bit halves to feed `write_rgba_u16_8`. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x + 16).cast()), mask_v); + let a_lo = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x + 16) as *const u8), + mask_v, + ); ( _mm256_castsi256_si128(a_lo), _mm256_extracti128_si256::<1>(a_lo), @@ -618,15 +646,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_u16_row::( + scalar::yuv_420p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -645,7 +673,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -656,7 +684,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -674,7 +702,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( /// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -685,7 +713,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -705,7 +733,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -717,7 +745,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -753,6 +781,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -799,13 +828,32 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 32 <= width { // 32 Y + 32 U + 32 V per iter. Full-width chroma (two 16-u16 - // loads each) — no horizontal duplication, 4:4:4 is 1:1. - let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); - let u_lo_vec = _mm256_and_si256(_mm256_loadu_si256(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm256_and_si256(_mm256_loadu_si256(u.as_ptr().add(x + 16).cast()), mask_v); - let v_lo_vec = _mm256_and_si256(_mm256_loadu_si256(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm256_and_si256(_mm256_loadu_si256(v.as_ptr().add(x + 16).cast()), mask_v); + // loads each) — no horizontal duplication, 4:4:4 is 1:1. BE + // input is byte-swapped via `load_endian_u16x16::` first. 
+ let y_low_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + mask_v, + ); + let u_lo_vec = _mm256_and_si256( + endian::load_endian_u16x16::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm256_and_si256( + endian::load_endian_u16x16::(u.as_ptr().add(x + 16) as *const u8), + mask_v, + ); + let v_lo_vec = _mm256_and_si256( + endian::load_endian_u16x16::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm256_and_si256( + endian::load_endian_u16x16::(v.as_ptr().add(x + 16) as *const u8), + mask_v, + ); let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias_v); @@ -881,8 +929,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x + 16).cast()), mask_v); + let a_lo = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x + 16) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 1024 at BITS=10), matching scalar. AVX2's // `_mm256_srli_epi16::` requires a literal shift, so @@ -914,15 +968,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_row::( + scalar::yuv_444p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_row::( + scalar::yuv_444p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -941,7 +995,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// Same as [`yuv_444p_n_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -952,7 +1006,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -970,7 +1024,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -981,7 +1035,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -1002,7 +1056,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx2")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -1014,7 +1068,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -1053,6 +1107,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -1101,12 +1156,31 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 32 <= width { - let y_low_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm256_and_si256(_mm256_loadu_si256(y.as_ptr().add(x + 16).cast()), mask_v); - let u_lo_vec = _mm256_and_si256(_mm256_loadu_si256(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm256_and_si256(_mm256_loadu_si256(u.as_ptr().add(x + 16).cast()), mask_v); - let v_lo_vec = _mm256_and_si256(_mm256_loadu_si256(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm256_and_si256(_mm256_loadu_si256(v.as_ptr().add(x + 16).cast()), mask_v); + // BE input is byte-swapped via `load_endian_u16x16::` first. 
+ let y_low_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm256_and_si256( + endian::load_endian_u16x16::(y.as_ptr().add(x + 16) as *const u8), + mask_v, + ); + let u_lo_vec = _mm256_and_si256( + endian::load_endian_u16x16::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm256_and_si256( + endian::load_endian_u16x16::(u.as_ptr().add(x + 16) as *const u8), + mask_v, + ); + let v_lo_vec = _mm256_and_si256( + endian::load_endian_u16x16::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm256_and_si256( + endian::load_endian_u16x16::(v.as_ptr().add(x + 16) as *const u8), + mask_v, + ); let u_lo_i16 = _mm256_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm256_sub_epi16(u_hi_vec, bias_v); @@ -1182,8 +1256,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< // 128-bit quarters consumed by the four `write_rgba_u16_8` // calls per iter (mirroring the R/G/B cast/extract pattern). let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_v = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x).cast()), mask_v); - let a_hi_v = _mm256_and_si256(_mm256_loadu_si256(a_ptr.add(x + 16).cast()), mask_v); + let a_lo_v = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi_v = _mm256_and_si256( + endian::load_endian_u16x16::(a_ptr.add(x + 16) as *const u8), + mask_v, + ); ( _mm256_castsi256_si128(a_lo_v), _mm256_extracti128_si256::<1>(a_lo_v), @@ -1262,15 +1342,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_u16_row::( + scalar::yuv_444p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs index 46700f74..dbf38db2 100644 --- a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs @@ -2,6 +2,26 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_avx512` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x32(v: __m512i) -> __m512i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 64], __m512i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, + 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, + 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm512_shuffle_epi8(v, mask) } + } else { + v + } +} + /// AVX‑512 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed /// **8‑bit** RGB. /// @@ -31,7 +51,7 @@ use super::*; /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -41,7 +61,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -51,7 +71,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgba_row( +pub(crate) unsafe fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -61,7 +81,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -76,7 +96,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -119,10 +139,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x).cast()), shr_count); - let y_high_i16 = - _mm512_srl_epi16(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), shr_count); + // BE input is byte-swapped via `load_endian_u16x32::` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. 
+ let y_low_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + shr_count, + ); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x32::(u_vec); + let v_vec = byteswap_u16x32::(v_vec); let u_vec = _mm512_srl_epi16(u_vec, shr_count); let v_vec = _mm512_srl_epi16(v_vec, shr_count); @@ -188,9 +217,9 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -213,7 +242,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -222,7 +251,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -235,7 +264,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -244,7 +273,7 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -263,7 +292,11 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( /// 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -309,10 +342,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. + let y_low_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + shr_count, + ); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x32::(u_vec); + let v_vec = byteswap_u16x32::(v_vec); let u_vec = _mm512_srl_epi16(u_vec, shr_count); let v_vec = _mm512_srl_epi16(v_vec, shr_count); @@ -390,9 +432,13 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -408,7 +454,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -418,7 +464,7 @@ 
pub(crate) unsafe fn p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -427,7 +473,7 @@ pub(crate) unsafe fn p16_to_rgb_row( /// Thin wrapper over [`p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p16_to_rgba_row( +pub(crate) unsafe fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -437,7 +483,7 @@ pub(crate) unsafe fn p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -445,7 +491,7 @@ pub(crate) unsafe fn p16_to_rgba_row( /// `ALPHA = true` uses `write_rgba_64` with constant `0xFF` alpha. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -482,9 +528,13 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 64 <= width { - let y_low = _mm512_loadu_si512(y.as_ptr().add(x).cast()); - let y_high = _mm512_loadu_si512(y.as_ptr().add(x + 32).cast()); + // BE input is byte-swapped via `load_endian_u16x32::` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x32::(u_vec); + let v_vec = byteswap_u16x32::(v_vec); let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); @@ -547,9 +597,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -562,7 +612,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// Same as [`p16_to_rgb_row`] but `rgb_out` is `&mut [u16]`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p16_to_rgb_u16_row( +pub(crate) unsafe fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -571,7 +621,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -583,7 +633,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( /// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p16_to_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -592,7 +642,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -611,7 +661,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -656,7 +706,13 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( // Per-128-bit-lane shuffle to deinterleave u16 UV pairs within // each lane: `[u0,v0,u1,v1,u2,v2,u3,v3] → [u0,u1,u2,u3,v0,v1,v2,v3]` // as u16 = byte indices `[0,1, 4,5, 8,9, 12,13 | 2,3, 6,7, 10,11, 14,15]`. - let uv_lane_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + // For BE: also swap the two bytes within each u16 lane during the + // deinterleave shuffle. + let uv_lane_mask = if BE { + _mm_setr_epi8(1, 0, 5, 4, 9, 8, 13, 12, 3, 2, 7, 6, 11, 10, 15, 14) + } else { + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15) + }; let uv_deint_mask = _mm512_broadcast_i32x4(uv_lane_mask); // After the per-lane shuffle the 64-bit lane layout is // `[U0_3, V0_3, U4_7, V4_7, U8_11, V8_11, U12_15, V12_15]`; permute @@ -666,7 +722,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { - let y_vec = _mm512_loadu_si512(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x32::` for Y. + // The UV byte-swap is folded into the deinterleave shuffle mask. 
+ let y_vec = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); // 16 UV pairs = 32 u16 = one 512-bit load. let uv_raw = _mm512_loadu_si512(uv_half.as_ptr().add(x).cast()); let uv_deint = _mm512_shuffle_epi8(uv_raw, uv_deint_mask); @@ -745,9 +803,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } diff --git a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_4_4.rs index a0484273..147dcb8d 100644 --- a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_4_4.rs @@ -2,6 +2,26 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16_avx512` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x32(v: __m512i) -> __m512i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 64], __m512i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, + 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, + 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm512_shuffle_epi8(v, mask) } + } else { + v + } +} + // ===== Pn 4:4:4 (semi-planar high-bit-packed) → RGB ======================= // // Native AVX-512 4:4:4 Pn kernels — combine `yuv_444p_n_to_rgb_row`'s @@ -25,7 +45,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_to_rgb_row( +pub(crate) unsafe fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -35,7 +55,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -49,7 +69,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( /// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_to_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -59,7 +79,7 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -76,7 +96,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -114,13 +138,24 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. + let y_low_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + shr_count, + ); // 128 UV elements (= 64 pairs) per iter — two deinterleave calls. 
let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2 + 64)); + let u_lo_vec = byteswap_u16x32::(u_lo_vec); + let v_lo_vec = byteswap_u16x32::(v_lo_vec); + let u_hi_vec = byteswap_u16x32::(u_hi_vec); + let v_hi_vec = byteswap_u16x32::(v_hi_vec); let u_lo_vec = _mm512_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm512_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm512_srl_epi16(u_hi_vec, shr_count); @@ -221,9 +256,13 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgba_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgb_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -239,7 +278,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -249,7 +288,9 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } @@ -263,7 +304,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -273,7 +314,9 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -290,7 +333,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( /// 3. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -331,12 +378,23 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. + let y_low_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm512_srl_epi16( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + shr_count, + ); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2 + 64)); + let u_lo_vec = byteswap_u16x32::(u_lo_vec); + let v_lo_vec = byteswap_u16x32::(v_lo_vec); + let u_hi_vec = byteswap_u16x32::(u_hi_vec); + let v_hi_vec = byteswap_u16x32::(v_hi_vec); let u_lo_vec = _mm512_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm512_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm512_srl_epi16(u_hi_vec, shr_count); @@ -449,11 +507,11 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + scalar::p_n_444_to_rgba_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } else { - scalar::p_n_444_to_rgb_u16_row::( + scalar::p_n_444_to_rgb_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } @@ -473,7 +531,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row= 3 * width`. 
#[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgb_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -483,7 +541,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -497,7 +555,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -507,7 +565,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -522,7 +580,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -557,11 +615,17 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 64 <= width { - let y_low = _mm512_loadu_si512(y.as_ptr().add(x).cast()); - let y_high = _mm512_loadu_si512(y.as_ptr().add(x + 32).cast()); + // BE input is byte-swapped via `load_endian_u16x32::` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2 + 64)); + let u_lo_vec = byteswap_u16x32::(u_lo_vec); + let v_lo_vec = byteswap_u16x32::(v_lo_vec); + let u_hi_vec = byteswap_u16x32::(u_hi_vec); + let v_hi_vec = byteswap_u16x32::(v_hi_vec); let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); @@ -657,9 +721,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -676,7 +740,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -686,7 +750,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -700,7 +764,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( /// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -710,7 +774,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -725,7 +789,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "avx512bw,avx512f")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -764,8 +828,12 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { // 32 pixels per iter — one deinterleave (64 UV elements = 32 pairs). - let y_vec = _mm512_loadu_si512(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x32::` for Y, + // and via `byteswap_u16x32::` after deinterleave for UV. 
+ let y_vec = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16_avx512(uv_full.as_ptr().add(x * 2)); + let u_vec = byteswap_u16x32::(u_vec); + let v_vec = byteswap_u16x32::(v_vec); let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); @@ -853,9 +921,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } diff --git a/src/row/arch/x86_avx512/tests/be_parity.rs b/src/row/arch/x86_avx512/tests/be_parity.rs new file mode 100644 index 00000000..005de034 --- /dev/null +++ b/src/row/arch/x86_avx512/tests/be_parity.rs @@ -0,0 +1,265 @@ +//! BE parity tests for AVX-512 high-bit YUV / P-format kernels. +//! +//! Each test takes a randomized LE input buffer, byte-swaps every u16 +//! element to produce a BE-encoded buffer, then asserts that +//! `kernel::(swapped_input)` produces byte-identical output +//! to `kernel::(original_input)`. 
+ +use super::{ + super::*, high_bit_plane_avx512, interleave_uv_avx512, p_n_packed_plane, p010_uv_interleave, + p16_plane_avx512, planar_n_plane, +}; + +fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf.iter().map(|x| x.swap_bytes()).collect() +} + +fn avx512_available() -> bool { + std::arch::is_x86_feature_detected!("avx512f") && std::arch::is_x86_feature_detected!("avx512bw") +} + +#[test] +fn avx512_yuv_420p10_be_parity_u8() { + if !avx512_available() { + return; + } + let width = 128; + let y = planar_n_plane::<10>(width, 13); + let u = planar_n_plane::<10>(width / 2, 17); + let v = planar_n_plane::<10>(width / 2, 19); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_yuv_420p10_be_parity_u16() { + if !avx512_available() { + return; + } + let width = 128; + let y = planar_n_plane::<10>(width, 23); + let u = planar_n_plane::<10>(width / 2, 29); + let v = planar_n_plane::<10>(width / 2, 31); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + yuv_420p_n_to_rgb_u16_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_yuv_444p12_be_parity_u8() { + if !avx512_available() { + return; + } + let width = 128; + let y = planar_n_plane::<12>(width, 
41); + let u = planar_n_plane::<12>(width, 43); + let v = planar_n_plane::<12>(width, 47); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_yuv_420p16_be_parity_u8() { + if !avx512_available() { + return; + } + let width = 128; + let y = p16_plane_avx512(width, 53); + let u = p16_plane_avx512(width / 2, 59); + let v = p16_plane_avx512(width / 2, 61); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_yuv_444p16_be_parity_u16() { + if !avx512_available() { + return; + } + let width = 64; + let y = p16_plane_avx512(width, 67); + let u = p16_plane_avx512(width, 71); + let v = p16_plane_avx512(width, 73); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p16_to_rgb_u16_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_p010_be_parity_u8() { + if !avx512_available() { + return; 
+ } + let width = 128; + let y = p_n_packed_plane::<10>(width, 79); + let u_half = p_n_packed_plane::<10>(width / 2, 83); + let v_half = p_n_packed_plane::<10>(width / 2, 89); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_p410_be_parity_u8() { + if !avx512_available() { + return; + } + let width = 128; + let y = p_n_packed_plane::<10>(width, 97); + let u_full = high_bit_plane_avx512::<10>(width, 101); + let v_full = high_bit_plane_avx512::<10>(width, 103); + let uv_full = interleave_uv_avx512(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_p016_be_parity_u8() { + if !avx512_available() { + return; + } + let width = 128; + let y = p16_plane_avx512(width, 107); + let u_half = p16_plane_avx512(width / 2, 109); + let v_half = p16_plane_avx512(width / 2, 113); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p16_to_rgb_row::(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } 
+ assert_eq!(out_le, out_be); +} + +#[test] +fn avx512_p416_be_parity_u16() { + if !avx512_available() { + return; + } + let width = 64; + let y = p16_plane_avx512(width, 127); + let u_full = p16_plane_avx512(width, 131); + let v_full = p16_plane_avx512(width, 137); + let uv_full = interleave_uv_avx512(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_444_16_to_rgb_u16_row::(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} diff --git a/src/row/arch/x86_avx512/tests/high_bit_4_2_0.rs b/src/row/arch/x86_avx512/tests/high_bit_4_2_0.rs index 8bc8ff4c..9a84c555 100644 --- a/src/row/arch/x86_avx512/tests/high_bit_4_2_0.rs +++ b/src/row/arch/x86_avx512/tests/high_bit_4_2_0.rs @@ -104,9 +104,17 @@ fn check_p10_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -132,9 +140,17 @@ fn check_p10_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_rang let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, 
+ matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -218,9 +234,17 @@ fn check_p_n_u8_avx512_equivalence( let v = p_n_plane_avx512::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -241,9 +265,17 @@ fn check_p_n_u16_avx512_equivalence( let v = p_n_plane_avx512::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -298,9 +330,9 @@ fn check_p010_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_rang let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10, 
false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u8 diverges"); } @@ -315,9 +347,9 @@ fn check_p010_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_ran let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 P010→u16 diverges"); } @@ -383,9 +415,17 @@ fn check_planar_u8_avx512_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -406,9 +446,17 @@ fn check_planar_u16_avx512_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( 
rgb_scalar, rgb_simd, @@ -430,9 +478,9 @@ fn check_pn_u8_avx512_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u8 diverges"); } @@ -451,9 +499,9 @@ fn check_pn_u16_avx512_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "AVX-512 Pn {BITS}-bit → u16 diverges"); } @@ -523,9 +571,9 @@ fn check_yuv420p16_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full let v = p16_plane_avx512(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -542,9 +590,17 @@ fn check_yuv420p16_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, ful let v = p16_plane_avx512(width / 2, 71); let mut rgb_scalar = std::vec![0u16; 
width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -562,9 +618,9 @@ fn check_p16_u8_avx512_equivalence(width: usize, matrix: ColorMatrix, full_range let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -582,9 +638,9 @@ fn check_p16_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, full_rang let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, diff --git a/src/row/arch/x86_avx512/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/x86_avx512/tests/high_bit_4_4_4_and_pn.rs index 2e9e62f8..ef6b67e4 100644 --- a/src/row/arch/x86_avx512/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/x86_avx512/tests/high_bit_4_4_4_and_pn.rs @@ -20,9 +20,17 @@ fn check_planar_u8_avx512_rgba_equivalence_n( let 
v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -41,9 +49,9 @@ fn check_pn_u8_avx512_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -57,9 +65,9 @@ fn check_yuv420p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_avx512(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -74,9 +82,9 @@ fn check_p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p16_to_rgba_row(&y, 
&uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -206,7 +214,7 @@ fn check_planar_u16_avx512_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( &y, &u, &v, @@ -216,7 +224,15 @@ fn check_planar_u16_avx512_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -235,9 +251,9 @@ fn check_pn_u16_avx512_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -255,9 +271,17 @@ fn check_yuv420p16_u16_avx512_rgba_equivalence( let v = p16_plane_avx512(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut 
rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -272,9 +296,9 @@ fn check_p16_u16_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, full let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -408,9 +432,9 @@ fn check_p_n_444_u8_avx512_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -432,9 +456,16 @@ fn check_p_n_444_u16_avx512_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::( + &y, + &uv, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -452,9 +483,9 @@ fn check_p_n_444_16_u8_avx512_equivalence(width: usize, 
matrix: ColorMatrix, ful let uv = interleave_uv_avx512(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -472,9 +503,9 @@ fn check_p_n_444_16_u16_avx512_equivalence(width: usize, matrix: ColorMatrix, fu let uv = interleave_uv_avx512(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -564,9 +595,17 @@ fn check_yuv444p_n_u8_avx512_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -585,9 +624,9 @@ fn check_pn_444_u8_avx512_rgba_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - 
scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -601,9 +640,9 @@ fn check_yuv444p16_u8_avx512_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane_avx512(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -622,9 +661,9 @@ fn check_p_n_444_16_u8_avx512_rgba_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -732,7 +771,7 @@ fn check_yuv444p16_u8_avx512_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx512(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -743,7 +782,7 @@ fn 
check_yuv444p16_u8_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_with_alpha_src_row( + yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_avx512/tests/mod.rs b/src/row/arch/x86_avx512/tests/mod.rs index de1c57ad..9ef8affd 100644 --- a/src/row/arch/x86_avx512/tests/mod.rs +++ b/src/row/arch/x86_avx512/tests/mod.rs @@ -1,4 +1,5 @@ mod ayuv64; +mod be_parity; mod endian; mod high_bit_4_2_0; mod high_bit_4_4_4_and_pn; diff --git a/src/row/arch/x86_avx512/tests/planar_8bit_and_nv.rs b/src/row/arch/x86_avx512/tests/planar_8bit_and_nv.rs index 6887b339..ed6f10f5 100644 --- a/src/row/arch/x86_avx512/tests/planar_8bit_and_nv.rs +++ b/src/row/arch/x86_avx512/tests/planar_8bit_and_nv.rs @@ -666,11 +666,35 @@ fn check_yuv_444p_n_equivalence( let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_avx512 = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + scalar::yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_avx512, width, matrix, full_range); - yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_avx512, width, matrix, full_range); + yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_avx512, width, matrix, full_range); + yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_avx512, + width, + matrix, + full_range, + ); } assert_eq!( rgb_scalar, rgb_avx512, @@ -756,11 +780,19 @@ fn check_yuv_444p16_equivalence(width: usize, matrix: ColorMatrix, full_range: b let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_avx512 = std::vec![0u16; width * 3]; - scalar::yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, 
matrix, full_range); - scalar::yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_avx512, width, matrix, full_range); - yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_avx512, width, matrix, full_range); + yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_avx512, width, matrix, full_range); + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut u16_avx512, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_avx512, "AVX-512 yuv_444p16 u8 ≠ scalar"); assert_eq!(u16_scalar, u16_avx512, "AVX-512 yuv_444p16 u16 ≠ scalar"); diff --git a/src/row/arch/x86_avx512/tests/yuva.rs b/src/row/arch/x86_avx512/tests/yuva.rs index 5e11edd8..8bbd0bce 100644 --- a/src/row/arch/x86_avx512/tests/yuva.rs +++ b/src/row/arch/x86_avx512/tests/yuva.rs @@ -23,7 +23,7 @@ fn check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -34,7 +34,7 @@ fn check_yuv444p_n_u8_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_with_alpha_src_row::( + yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -222,7 +222,7 @@ fn check_yuv420p_n_u8_avx512_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -233,7 +233,7 @@ fn check_yuv420p_n_u8_avx512_rgba_with_alpha_src_equivalence( full_range, ); 
unsafe { - yuv_420p_n_to_rgba_with_alpha_src_row::( + yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -262,7 +262,7 @@ fn check_yuv420p16_u8_avx512_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx512(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -273,7 +273,7 @@ fn check_yuv420p16_u8_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_with_alpha_src_row( + yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -414,7 +414,7 @@ fn check_yuv444p_n_u16_avx512_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( &y, &u, &v, @@ -424,7 +424,15 @@ fn check_yuv444p_n_u16_avx512_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -443,9 +451,16 @@ fn check_pn_444_u16_avx512_rgba_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::( + &y, + &uv, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -463,9 +478,17 @@ fn check_yuv444p16_u16_avx512_rgba_equivalence( let v = p16_plane_avx512(width, 71); let mut 
rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -484,9 +507,9 @@ fn check_p_n_444_16_u16_avx512_rgba_equivalence( let uv = interleave_uv_avx512(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -594,7 +617,7 @@ fn check_yuv444p16_u16_avx512_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx512(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -605,7 +628,7 @@ fn check_yuv444p16_u16_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_u16_with_alpha_src_row( + yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -699,7 +722,7 @@ fn check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + 
scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -710,7 +733,7 @@ fn check_yuv444p_n_u16_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -853,7 +876,7 @@ fn check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -864,7 +887,7 @@ fn check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -893,7 +916,7 @@ fn check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence( let a_src = p16_plane_avx512(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -904,7 +927,7 @@ fn check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_u16_with_alpha_src_row( + yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_avx512/yuv_planar_16bit.rs b/src/row/arch/x86_avx512/yuv_planar_16bit.rs index 8cb6e56f..25c3c3b5 100644 --- a/src/row/arch/x86_avx512/yuv_planar_16bit.rs +++ b/src/row/arch/x86_avx512/yuv_planar_16bit.rs @@ -15,7 +15,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p16_to_rgb_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -26,7 +26,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -42,7 +42,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( /// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p16_to_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -53,7 +53,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -73,7 +73,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -85,7 +85,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -116,7 +116,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -158,12 +162,13 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row` first. + let y_low = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8); + let u_lo_vec = endian::load_endian_u16x32::(u.as_ptr().add(x) as *const u8); + let u_hi_vec = endian::load_endian_u16x32::(u.as_ptr().add(x + 32) as *const u8); + let v_lo_vec = endian::load_endian_u16x32::(v.as_ptr().add(x) as *const u8); + let v_hi_vec = endian::load_endian_u16x32::(v.as_ptr().add(x + 32) as *const u8); let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias16_v); @@ -251,8 +256,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit u8. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm512_srli_epi16::<8>(_mm512_loadu_si512(a_ptr.add(x).cast())); - let a_hi = _mm512_srli_epi16::<8>(_mm512_loadu_si512(a_ptr.add(x + 32).cast())); + let a_lo = + _mm512_srli_epi16::<8>(endian::load_endian_u16x32::(a_ptr.add(x) as *const u8)); + let a_hi = _mm512_srli_epi16::<8>(endian::load_endian_u16x32::( + a_ptr.add(x + 32) as *const u8 + )); narrow_u8x64(a_lo, a_hi, pack_fixup) } else { alpha_u8 @@ -273,15 +281,17 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_row( + scalar::yuv_444p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -301,7 +311,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -312,7 +322,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -328,7 +338,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -339,7 +349,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -359,7 +369,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -371,7 +381,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -404,7 +414,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -451,9 +465,10 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row` first. 
+ let y_vec = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); + let u_vec = endian::load_endian_u16x32::(u.as_ptr().add(x) as *const u8); + let v_vec = endian::load_endian_u16x32::(v.as_ptr().add(x) as *const u8); let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); @@ -541,7 +556,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row(a_ptr.add(x) as *const u8); let a0 = _mm512_extracti32x4_epi32::<0>(a_vec); let a1 = _mm512_extracti32x4_epi32::<1>(a_vec); let a2 = _mm512_extracti32x4_epi32::<2>(a_vec); @@ -594,15 +609,15 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_u16_row( + scalar::yuv_444p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_u16_row( + scalar::yuv_444p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -615,7 +630,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -626,7 +641,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -637,7 +652,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( /// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p16_to_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -648,7 +663,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -669,7 +684,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -681,7 +696,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -704,7 +719,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -749,10 +768,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row` first. + let y_low = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8); + let u_vec = endian::load_endian_u16x32::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x32::(v_half.as_ptr().add(x / 2) as *const u8); let u_i16 = _mm512_sub_epi16(u_vec, bias16_v); let v_i16 = _mm512_sub_epi16(v_vec, bias16_v); @@ -808,8 +828,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row> 8` to fit u8. // `_mm512_srli_epi16::<8>` accepts a const literal shift. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm512_srli_epi16::<8>(_mm512_loadu_si512(a_ptr.add(x).cast())); - let a_hi = _mm512_srli_epi16::<8>(_mm512_loadu_si512(a_ptr.add(x + 32).cast())); + let a_lo = + _mm512_srli_epi16::<8>(endian::load_endian_u16x32::(a_ptr.add(x) as *const u8)); + let a_hi = _mm512_srli_epi16::<8>(endian::load_endian_u16x32::( + a_ptr.add(x + 32) as *const u8 + )); narrow_u8x64(a_lo, a_hi, pack_fixup) } else { alpha_u8 @@ -830,15 +853,17 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_row( + scalar::yuv_420p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -861,7 +886,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -871,7 +896,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -885,7 +910,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -895,7 +920,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -915,7 +940,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -927,7 +952,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -959,7 +984,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -1015,16 +1044,36 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row` for Y and via + // inline `_mm256_shuffle_epi8` for the half-width U/V loads. 
+ let y_vec = endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8); let u_vec = _mm256_loadu_si256(u_half.as_ptr().add(x / 2).cast()); let v_vec = _mm256_loadu_si256(v_half.as_ptr().add(x / 2).cast()); + let u_vec = if BE { + _mm256_shuffle_epi8(u_vec, bswap_u16_256) + } else { + u_vec + }; + let v_vec = if BE { + _mm256_shuffle_epi8(v_vec, bswap_u16_256) + } else { + v_vec + }; // Center UV by subtracting 32768 (wrapping i16 sub). Using // `_mm256_sub_epi16` with bias16 (which carries -32768 as i16): @@ -1112,7 +1161,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(a_ptr.add(x) as *const u8); let a0 = _mm512_extracti32x4_epi32::<0>(a_vec); let a1 = _mm512_extracti32x4_epi32::<1>(a_vec); let a2 = _mm512_extracti32x4_epi32::<2>(a_vec); @@ -1165,15 +1214,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_u16_row( + scalar::yuv_420p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_u16_row( + scalar::yuv_420p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_avx512/yuv_planar_high_bit.rs b/src/row/arch/x86_avx512/yuv_planar_high_bit.rs index 4346e474..1bcae07b 100644 --- a/src/row/arch/x86_avx512/yuv_planar_high_bit.rs +++ b/src/row/arch/x86_avx512/yuv_planar_high_bit.rs @@ -31,7 +31,7 @@ use super::*; /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -42,7 +42,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -53,7 +53,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( /// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_row`] with `ALPHA = true`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -64,7 +64,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -84,7 +84,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -96,7 +96,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -132,6 +132,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -184,14 +185,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< while x + 64 <= width { // AND‑mask every load to the low 10 bits — see matching // comment in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]. - let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); + // BE input is byte-swapped via `load_endian_u16x32::` first. 
+ let y_low_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + mask_v, + ); let u_vec = _mm512_and_si512( - _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x32::(u_half.as_ptr().add(x / 2) as *const u8), mask_v, ); let v_vec = _mm512_and_si512( - _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x32::(v_half.as_ptr().add(x / 2) as *const u8), mask_v, ); @@ -254,8 +262,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // `BITS - 8`. Reuse `narrow_u8x64` so the alpha bytes line // up with R/G/B in `write_rgba_64`. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x + 32).cast()), mask_v); + let a_lo = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x + 32) as *const u8), + mask_v, + ); let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32); let a_lo_shifted = _mm512_srl_epi16(a_lo, a_shr); let a_hi_shifted = _mm512_srl_epi16(a_hi, a_shr); @@ -280,15 +294,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_row::( + scalar::yuv_420p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_row::( + scalar::yuv_420p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -316,7 +330,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -326,7 +340,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -341,7 +355,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( /// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -351,7 +365,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -371,7 +385,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -383,7 +397,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -420,6 +434,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -474,15 +489,22 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 64 <= width { // AND‑mask loads to the low 10 bits so `chroma_i16x32`'s - // `_mm512_packs_epi32` narrow stays lossless. - let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); + // `_mm512_packs_epi32` narrow stays lossless. BE input is + // byte-swapped via `load_endian_u16x32::` first. 
+ let y_low_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + mask_v, + ); let u_vec = _mm512_and_si512( - _mm512_loadu_si512(u_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x32::(u_half.as_ptr().add(x / 2) as *const u8), mask_v, ); let v_vec = _mm512_and_si512( - _mm512_loadu_si512(v_half.as_ptr().add(x / 2).cast()), + endian::load_endian_u16x32::(v_half.as_ptr().add(x / 2) as *const u8), mask_v, ); @@ -542,8 +564,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // bit depth, no shift; just split each 512-bit load into // four 128-bit quarters via `_mm512_extracti32x4_epi32`. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x + 32).cast()), mask_v); + let a_lo = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x + 32) as *const u8), + mask_v, + ); ( _mm512_extracti32x4_epi32::<0>(a_lo), _mm512_extracti32x4_epi32::<1>(a_lo), @@ -592,15 +620,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_u16_row::( + scalar::yuv_420p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -620,7 +648,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -631,7 +659,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -649,7 +677,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( /// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -660,7 +688,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -681,7 +709,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -693,7 +721,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -729,6 +757,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -777,13 +806,32 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 64 <= width { // 64 Y + 64 U + 64 V per iter. Full-width chroma (two 512-bit - // loads each) — no horizontal duplication, 4:4:4 is 1:1. - let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); - let u_lo_vec = _mm512_and_si512(_mm512_loadu_si512(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm512_and_si512(_mm512_loadu_si512(u.as_ptr().add(x + 32).cast()), mask_v); - let v_lo_vec = _mm512_and_si512(_mm512_loadu_si512(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm512_and_si512(_mm512_loadu_si512(v.as_ptr().add(x + 32).cast()), mask_v); + // loads each) — no horizontal duplication, 4:4:4 is 1:1. BE + // input is byte-swapped via `load_endian_u16x32::` first. 
+ let y_low_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + mask_v, + ); + let u_lo_vec = _mm512_and_si512( + endian::load_endian_u16x32::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm512_and_si512( + endian::load_endian_u16x32::(u.as_ptr().add(x + 32) as *const u8), + mask_v, + ); + let v_lo_vec = _mm512_and_si512( + endian::load_endian_u16x32::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm512_and_si512( + endian::load_endian_u16x32::(v.as_ptr().add(x + 32) as *const u8), + mask_v, + ); let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias_v); @@ -871,8 +919,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x + 32).cast()), mask_v); + let a_lo = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x + 32) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 1024 at BITS=10), matching scalar. AVX-512's // `_mm512_srli_epi16::` requires a literal shift, so @@ -903,15 +957,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_row::( + scalar::yuv_444p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_row::( + scalar::yuv_444p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -930,7 +984,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// Same as [`yuv_444p_n_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -941,7 +995,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -958,7 +1012,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -969,7 +1023,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -990,7 +1044,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "avx512f,avx512bw")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -1002,7 +1056,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -1041,6 +1095,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -1091,12 +1146,31 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 64 <= width { - let y_low_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm512_and_si512(_mm512_loadu_si512(y.as_ptr().add(x + 32).cast()), mask_v); - let u_lo_vec = _mm512_and_si512(_mm512_loadu_si512(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm512_and_si512(_mm512_loadu_si512(u.as_ptr().add(x + 32).cast()), mask_v); - let v_lo_vec = _mm512_and_si512(_mm512_loadu_si512(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm512_and_si512(_mm512_loadu_si512(v.as_ptr().add(x + 32).cast()), mask_v); + // BE input is byte-swapped via `load_endian_u16x32::` first. 
+ let y_low_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm512_and_si512( + endian::load_endian_u16x32::(y.as_ptr().add(x + 32) as *const u8), + mask_v, + ); + let u_lo_vec = _mm512_and_si512( + endian::load_endian_u16x32::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm512_and_si512( + endian::load_endian_u16x32::(u.as_ptr().add(x + 32) as *const u8), + mask_v, + ); + let v_lo_vec = _mm512_and_si512( + endian::load_endian_u16x32::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm512_and_si512( + endian::load_endian_u16x32::(v.as_ptr().add(x + 32) as *const u8), + mask_v, + ); let u_lo_i16 = _mm512_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm512_sub_epi16(u_hi_vec, bias_v); @@ -1188,8 +1262,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert above. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo_v = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x).cast()), mask_v); - let a_hi_v = _mm512_and_si512(_mm512_loadu_si512(a_ptr.add(x + 32).cast()), mask_v); + let a_lo_v = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi_v = _mm512_and_si512( + endian::load_endian_u16x32::(a_ptr.add(x + 32) as *const u8), + mask_v, + ); ( _mm512_extracti32x4_epi32::<0>(a_lo_v), _mm512_extracti32x4_epi32::<1>(a_lo_v), @@ -1238,15 +1318,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_u16_row::( + scalar::yuv_444p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs index 3def1a88..38a914d2 100644 --- a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs @@ -2,6 +2,26 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16` (or other UV-interleaved loads) to +/// apply the BE byte-swap that `load_endian_u16x8` cannot perform for +/// shuffled-then-loaded values. When `BE = false` this compiles away +/// entirely. +#[inline(always)] +unsafe fn byteswap_u16x8(v: __m128i) -> __m128i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 16], __m128i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm_shuffle_epi8(v, mask) } + } else { + v + } +} + /// SSE4.1 high‑bit‑packed semi‑planar (`BITS` ∈ {10, 12}) → packed /// **8‑bit** RGB. /// @@ -30,7 +50,7 @@ use super::*; /// Thin wrapper over [`p_n_to_rgb_or_rgba_row`] with `ALPHA = false`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgb_row( +pub(crate) unsafe fn p_n_to_rgb_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -40,7 +60,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -54,7 +74,7 @@ pub(crate) unsafe fn p_n_to_rgb_row( /// Same as [`p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgba_row( +pub(crate) unsafe fn p_n_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -64,7 +84,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -81,7 +101,7 @@ pub(crate) unsafe fn p_n_to_rgba_row( /// 4. `BITS` ∈ `{10, 12}` — P016 has its own kernel family. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -122,13 +142,26 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - // Y: two u16×8 loads, each shifted right by `16 - BITS`. - let y_low_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x).cast()), shr_count); - let y_high_i16 = _mm_srl_epi16(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), shr_count); + // Y: two u16×8 loads, each shifted right by `16 - BITS`. BE + // input is byte-swapped before the shift via `load_endian_u16x8`. + let y_low_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr_count, + ); // UV: two u16×8 loads of interleaved [U0,V0,U1,V1,...], then - // deinterleave into separate u_vec + v_vec. + // deinterleave into separate u_vec + v_vec. 
For BE input, + // byte-swap each lane after deinterleave (the shuffle that + // splits U/V can't be combined with the byte-swap shuffle in a + // single pshufb). The shift to extract the BITS-bit value runs + // on host-native u16, so it must follow the byte-swap. let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_vec = _mm_srl_epi16(u_vec, shr_count); let v_vec = _mm_srl_epi16(v_vec, shr_count); @@ -185,9 +218,9 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -210,7 +243,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_row( /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgb_u16_row( +pub(crate) unsafe fn p_n_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -219,7 +252,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -232,7 +265,7 @@ pub(crate) unsafe fn p_n_to_rgb_u16_row( /// Same as [`p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -241,7 +274,7 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p_n_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -260,7 +293,11 @@ pub(crate) unsafe fn p_n_to_rgba_u16_row( /// 4. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -303,9 +340,19 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row` after deinterleave for UV. + let y_low_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr_count, + ); let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_vec = _mm_srl_epi16(u_vec, shr_count); let v_vec = _mm_srl_epi16(v_vec, shr_count); @@ -366,9 +413,13 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -387,7 +438,7 @@ pub(crate) unsafe fn p_n_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -397,7 +448,7 @@ pub(crate) unsafe fn p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared 
impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -410,7 +461,7 @@ pub(crate) unsafe fn p16_to_rgb_row( /// Same as [`p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p16_to_rgba_row( +pub(crate) unsafe fn p16_to_rgba_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -420,7 +471,7 @@ pub(crate) unsafe fn p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -436,7 +487,7 @@ pub(crate) unsafe fn p16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_row( y: &[u16], uv_half: &[u16], out: &mut [u8], @@ -470,9 +521,13 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_low = _mm_loadu_si128(y.as_ptr().add(x).cast()); - let y_high = _mm_loadu_si128(y.as_ptr().add(x + 8).cast()); + // BE input is byte-swapped via `load_endian_u16x8` for Y and + // via `byteswap_u16x8::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16(uv_half.as_ptr().add(x)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_i16 = _mm_sub_epi16(u_vec, bias16_v); let v_i16 = _mm_sub_epi16(v_vec, bias16_v); @@ -526,9 +581,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -541,7 +596,7 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_row( /// Same as [`p16_to_rgb_row`] but `rgb_out` is `&mut [u16]`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p16_to_rgb_u16_row( +pub(crate) unsafe fn p16_to_rgb_u16_row( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -550,7 +605,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } } @@ -562,7 +617,7 @@ pub(crate) unsafe fn p16_to_rgb_u16_row( /// Same as [`p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p16_to_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgba_u16_row( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -571,7 +626,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + p16_to_rgb_or_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } } @@ -587,7 +642,7 @@ pub(crate) unsafe fn p16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( y: &[u16], uv_half: &[u16], out: &mut [u16], @@ -621,12 +676,21 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let y_vec = _mm_loadu_si128(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x8` for Y. UV + // is loaded then split via shuffle; per-u16-lane byte-swap is + // folded into the split shuffle when `BE = true`. + let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); // Load 4 UV pairs = 8 u16 = 16 bytes; deinterleave inline. // uv_half.len() >= width >= x + 8 guarantees 8 u16 readable. let uv_raw = _mm_loadu_si128(uv_half.as_ptr().add(x).cast()); - // [U0,V0,U1,V1,U2,V2,U3,V3] → [U0,U1,U2,U3, V0,V1,V2,V3] - let split_mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + // [U0,V0,U1,V1,U2,V2,U3,V3] → [U0,U1,U2,U3, V0,V1,V2,V3]. + // For BE: also swap the two bytes within each u16 lane (lo/hi + // byte indices within each 16-bit element flipped). 
+ let split_mask = if BE { + _mm_setr_epi8(1, 0, 5, 4, 9, 8, 13, 12, 3, 2, 7, 6, 11, 10, 15, 14) + } else { + _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15) + }; let uv_split = _mm_shuffle_epi8(uv_raw, split_mask); let u_vec4 = uv_split; let v_vec4 = _mm_srli_si128::<8>(uv_split); @@ -718,9 +782,9 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgba_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p16_to_rgb_u16_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } diff --git a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs index b7f467c7..7bfe9e6c 100644 --- a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs @@ -2,6 +2,24 @@ use core::arch::x86_64::*; use super::*; +/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// +/// Used after `deinterleave_uv_u16` to apply per-lane byte-swapping +/// for BE input. When `BE = false` this compiles away entirely. +#[inline(always)] +unsafe fn byteswap_u16x8(v: __m128i) -> __m128i { + if BE { + let mask = unsafe { + core::mem::transmute::<[u8; 16], __m128i>([ + 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, + ]) + }; + unsafe { _mm_shuffle_epi8(v, mask) } + } else { + v + } +} + // ===== Pn 4:4:4 (semi-planar high-bit-packed) → RGB ======================= // // SSE4.1 kernels for `p_n_444_to_rgb_*` (BITS ∈ {10, 12}) and @@ -22,7 +40,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgb_row( +pub(crate) unsafe fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -32,7 +50,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -46,7 +64,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_row( /// Same as [`p_n_444_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -56,7 +74,7 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -73,7 +91,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_row( /// 3. `BITS` must be one of `{10, 12}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -109,12 +131,27 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row` after deinterleave for UV (the + // deinterleave shuffle and per-lane byte-swap don't compose into + // one pshufb without different masks per BE/LE — keeping a + // separate post-step is simpler and the BE path is rare). 
+ let y_low_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr_count, + ); // Two deinterleave calls — 32 UV u16 elements (= 16 pairs). let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_vec = _mm_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm_srl_epi16(u_hi_vec, shr_count); @@ -179,9 +216,13 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgba_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_to_rgb_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } @@ -199,7 +240,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_row= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -209,7 +250,9 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgb_out, width, matrix, full_range, + ); } } @@ -224,7 +267,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_u16_row( /// Same as [`p_n_444_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -234,7 +277,9 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_to_rgb_or_rgba_u16_row::( + y, uv_full, rgba_out, width, matrix, full_range, + ); } } @@ -251,7 +296,11 @@ pub(crate) unsafe fn p_n_444_to_rgba_u16_row( /// 3. `BITS` ∈ `{10, 12}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const BE: bool, +>( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -290,11 +339,23 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row` after deinterleave for UV. + let y_low_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + shr_count, + ); + let y_high_i16 = _mm_srl_epi16( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + shr_count, + ); let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_vec = _mm_srl_epi16(u_lo_vec, shr_count); let v_lo_vec = _mm_srl_epi16(v_lo_vec, shr_count); let u_hi_vec = _mm_srl_epi16(u_hi_vec, shr_count); @@ -363,11 +424,11 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row( + scalar::p_n_444_to_rgba_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, full_range, ); } else { - scalar::p_n_444_to_rgb_u16_row::( + scalar::p_n_444_to_rgb_u16_row::( tail_y, tail_uv, tail_out, tail_w, matrix, 
full_range, ); } @@ -389,7 +450,7 @@ pub(crate) unsafe fn p_n_444_to_rgb_or_rgba_u16_row= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgb_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -399,7 +460,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -414,7 +475,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -424,7 +485,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -439,7 +500,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( y: &[u16], uv_full: &[u16], out: &mut [u8], @@ -472,12 +533,18 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { - let y_low = _mm_loadu_si128(y.as_ptr().add(x).cast()); - let y_high = _mm_loadu_si128(y.as_ptr().add(x + 8).cast()); + // BE input is byte-swapped via `load_endian_u16x8` for Y and via + // `byteswap_u16x8::` after deinterleave for UV. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); // 32 UV elements per iter — two deinterleave calls. let (u_lo_vec, v_lo_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2)); let (u_hi_vec, v_hi_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2 + 16)); + let u_lo_vec = byteswap_u16x8::(u_lo_vec); + let v_lo_vec = byteswap_u16x8::(v_lo_vec); + let u_hi_vec = byteswap_u16x8::(u_hi_vec); + let v_hi_vec = byteswap_u16x8::(v_hi_vec); let u_lo_i16 = _mm_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm_sub_epi16(u_hi_vec, bias16_v); @@ -537,9 +604,9 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } else { - scalar::p_n_444_16_to_rgb_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); } } } @@ -556,7 +623,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_row( /// Same as [`p_n_444_16_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -566,7 +633,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } } @@ -580,7 +647,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_u16_row( /// Same as [`p_n_444_16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -590,7 +657,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + p_n_444_16_to_rgb_or_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } } @@ -605,7 +672,7 @@ pub(crate) unsafe fn p_n_444_16_to_rgba_u16_row( /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( y: &[u16], uv_full: &[u16], out: &mut [u16], @@ -641,8 +708,12 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { // 8 pixels per iter (i64 narrows). 16 UV u16 elements (= 8 pairs). - let y_vec = _mm_loadu_si128(y.as_ptr().add(x).cast()); + // BE input is byte-swapped via `load_endian_u16x8` for Y and via + // `byteswap_u16x8::` after deinterleave for UV. 
+ let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); let (u_vec, v_vec) = deinterleave_uv_u16(uv_full.as_ptr().add(x * 2)); + let u_vec = byteswap_u16x8::(u_vec); + let v_vec = byteswap_u16x8::(v_vec); let u_i16 = _mm_sub_epi16(u_vec, bias16_v); let v_i16 = _mm_sub_epi16(v_vec, bias16_v); @@ -750,9 +821,13 @@ pub(crate) unsafe fn p_n_444_16_to_rgb_or_rgba_u16_row( let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; if ALPHA { - scalar::p_n_444_16_to_rgba_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } else { - scalar::p_n_444_16_to_rgb_u16_row(tail_y, tail_uv, tail_out, tail_w, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::( + tail_y, tail_uv, tail_out, tail_w, matrix, full_range, + ); } } } diff --git a/src/row/arch/x86_sse41/tests/be_parity.rs b/src/row/arch/x86_sse41/tests/be_parity.rs new file mode 100644 index 00000000..a46ec7b7 --- /dev/null +++ b/src/row/arch/x86_sse41/tests/be_parity.rs @@ -0,0 +1,261 @@ +//! BE parity tests for SSE4.1 high-bit YUV / P-format kernels. +//! +//! Each test takes a randomized LE input buffer, byte-swaps every u16 +//! element to produce a BE-encoded buffer, then asserts that +//! `kernel::(swapped_input)` produces byte-identical output +//! to `kernel::(original_input)`. 
+ +use super::{ + super::*, high_bit_plane_sse41, interleave_uv_sse41, p_n_packed_plane, p010_uv_interleave, + p16_plane, planar_n_plane, +}; + +fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf.iter().map(|x| x.swap_bytes()).collect() +} + +#[test] +fn sse41_yuv_420p10_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = planar_n_plane::<10>(width, 13); + let u = planar_n_plane::<10>(width / 2, 17); + let v = planar_n_plane::<10>(width / 2, 19); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p_n_to_rgb_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_yuv_420p10_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = planar_n_plane::<10>(width, 23); + let u = planar_n_plane::<10>(width / 2, 29); + let v = planar_n_plane::<10>(width / 2, 31); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + yuv_420p_n_to_rgb_u16_row::<10, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_yuv_444p12_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = planar_n_plane::<12>(width, 41); + let u = planar_n_plane::<12>(width, 43); + let v = 
planar_n_plane::<12>(width, 47); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, true>( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_yuv_420p16_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = p16_plane(width, 53); + let u = p16_plane(width / 2, 59); + let v = p16_plane(width / 2, 61); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_yuv_444p16_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = p16_plane(width, 67); + let u = p16_plane(width, 71); + let v = p16_plane(width, 73); + let y_be = byteswap_u16_buf(&y); + let u_be = byteswap_u16_buf(&u); + let v_be = byteswap_u16_buf(&v); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p16_to_rgb_u16_row::( + &y_be, + &u_be, + &v_be, + &mut out_be, + width, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_p010_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; 
+ let y = p_n_packed_plane::<10>(width, 79); + let u_half = p_n_packed_plane::<10>(width / 2, 83); + let v_half = p_n_packed_plane::<10>(width / 2, 89); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_p410_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = p_n_packed_plane::<10>(width, 97); + let u_full = high_bit_plane_sse41::<10>(width, 101); + let v_full = high_bit_plane_sse41::<10>(width, 103); + let uv_full = interleave_uv_sse41(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_p016_be_parity_u8() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = p16_plane(width, 107); + let u_half = p16_plane(width / 2, 109); + let v_half = p16_plane(width / 2, 113); + let uv_half = p010_uv_interleave(&u_half, &v_half); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_half); + + let mut out_le = std::vec![0u8; width * 3]; + let mut out_be = std::vec![0u8; width * 3]; + unsafe { + p16_to_rgb_row::(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); 
+ } + assert_eq!(out_le, out_be); +} + +#[test] +fn sse41_p416_be_parity_u16() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + let width = 32; + let y = p16_plane(width, 127); + let u_full = p16_plane(width, 131); + let v_full = p16_plane(width, 137); + let uv_full = interleave_uv_sse41(&u_full, &v_full); + let y_be = byteswap_u16_buf(&y); + let uv_be = byteswap_u16_buf(&uv_full); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_444_16_to_rgb_u16_row::(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} diff --git a/src/row/arch/x86_sse41/tests/high_bit_4_2_0.rs b/src/row/arch/x86_sse41/tests/high_bit_4_2_0.rs index 2f00a965..153f5a89 100644 --- a/src/row/arch/x86_sse41/tests/high_bit_4_2_0.rs +++ b/src/row/arch/x86_sse41/tests/high_bit_4_2_0.rs @@ -105,9 +105,17 @@ fn check_p10_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -133,9 +141,17 @@ fn check_p10_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + &y, + &u, + &v, + &mut rgb_scalar, + 
width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::<10>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::<10, false>(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } if rgb_scalar != rgb_simd { @@ -219,9 +235,17 @@ fn check_p_n_u8_sse41_equivalence( let v = p_n_plane_sse41::(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -242,9 +266,17 @@ fn check_p_n_u16_sse41_equivalence( let v = p_n_plane_sse41::(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -297,9 +329,9 @@ fn check_p010_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::<10, 
false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u8 diverges"); } @@ -314,9 +346,9 @@ fn check_p010_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_rang let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::<10>(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::<10, false>(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 P010→u16 diverges"); } @@ -390,9 +422,17 @@ fn check_planar_u8_sse41_equivalence_n( let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -414,9 +454,17 @@ fn check_planar_u16_sse41_equivalence_n( let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p_n_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -438,9 +486,9 @@ fn check_pn_u8_sse41_equivalence_n( let uv = 
p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u8 diverges"); } @@ -459,9 +507,9 @@ fn check_pn_u16_sse41_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!(rgb_scalar, rgb_simd, "SSE4.1 Pn {BITS}-bit → u16 diverges"); } @@ -531,9 +579,9 @@ fn check_yuv420p16_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_ let v = p16_plane(width / 2, 71); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgb_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -550,9 +598,17 @@ fn check_yuv420p16_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full let v = p16_plane(width / 2, 71); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::yuv_420p16_to_rgb_u16_row(&y, &u, 
&v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgb_u16_row(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); + yuv_420p16_to_rgb_u16_row::(&y, &u, &v, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -570,9 +626,9 @@ fn check_p16_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range: let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -590,9 +646,9 @@ fn check_p16_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, full_range let uv = p010_uv_interleave(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, diff --git a/src/row/arch/x86_sse41/tests/high_bit_4_4_4_and_pn.rs b/src/row/arch/x86_sse41/tests/high_bit_4_4_4_and_pn.rs index b078aef9..cc06fac0 100644 --- a/src/row/arch/x86_sse41/tests/high_bit_4_4_4_and_pn.rs +++ b/src/row/arch/x86_sse41/tests/high_bit_4_4_4_and_pn.rs @@ -20,9 +20,17 @@ fn check_planar_u8_sse41_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = 
std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -41,9 +49,9 @@ fn check_pn_u8_sse41_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -57,9 +65,9 @@ fn check_yuv420p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane(width / 2, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_420p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -74,9 +82,9 @@ fn check_p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_r let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, 
matrix, full_range); unsafe { - p16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -210,7 +218,7 @@ fn check_planar_u16_sse41_rgba_equivalence_n( let v = planar_n_plane::(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( &y, &u, &v, @@ -220,7 +228,15 @@ fn check_planar_u16_sse41_rgba_equivalence_n( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -239,9 +255,9 @@ fn check_pn_u16_sse41_rgba_equivalence_n( let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -255,9 +271,17 @@ fn check_yuv420p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane(width / 2, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_420p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_420p16_to_rgba_u16_row::(&y, &u, &v, &mut 
rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -272,9 +296,9 @@ fn check_p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, full_ let uv = p010_uv_interleave(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -408,9 +432,9 @@ fn check_p_n_444_u8_sse41_equivalence( let uv = interleave_uv_sse41(&u, &v); let mut rgb_scalar = std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -432,9 +456,16 @@ fn check_p_n_444_u16_sse41_equivalence( let uv = interleave_uv_sse41(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::( + &y, + &uv, + &mut rgb_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -452,9 +483,9 @@ fn check_p_n_444_16_u8_sse41_equivalence(width: usize, matrix: ColorMatrix, full let uv = interleave_uv_sse41(&u, &v); let mut rgb_scalar = 
std::vec![0u8; width * 3]; let mut rgb_simd = std::vec![0u8; width * 3]; - scalar::p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -472,9 +503,9 @@ fn check_p_n_444_16_u16_sse41_equivalence(width: usize, matrix: ColorMatrix, ful let uv = interleave_uv_sse41(&u, &v); let mut rgb_scalar = std::vec![0u16; width * 3]; let mut rgb_simd = std::vec![0u16; width * 3]; - scalar::p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgb_u16_row(&y, &uv, &mut rgb_simd, width, matrix, full_range); + p_n_444_16_to_rgb_u16_row::(&y, &uv, &mut rgb_simd, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_simd, @@ -562,9 +593,17 @@ fn check_yuv444p_n_u8_sse41_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgba_row::( + &y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -583,9 +622,9 @@ fn check_pn_444_u8_sse41_rgba_equivalence( let uv = interleave_uv_sse41(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + 
scalar::p_n_444_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -599,9 +638,9 @@ fn check_yuv444p16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane(width, 71); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); unsafe { - yuv_444p16_to_rgba_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -616,9 +655,9 @@ fn check_p_n_444_16_u8_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, let uv = interleave_uv_sse41(&u, &v); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -726,7 +765,7 @@ fn check_yuv444p16_u8_sse41_rgba_with_alpha_src_equivalence( let a_src = p16_plane(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p16_to_rgba_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -737,7 +776,7 @@ fn check_yuv444p16_u8_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - 
yuv_444p16_to_rgba_with_alpha_src_row( + yuv_444p16_to_rgba_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_sse41/tests/mod.rs b/src/row/arch/x86_sse41/tests/mod.rs index f5e14528..03cfd378 100644 --- a/src/row/arch/x86_sse41/tests/mod.rs +++ b/src/row/arch/x86_sse41/tests/mod.rs @@ -1,4 +1,5 @@ mod ayuv64; +mod be_parity; mod endian; mod high_bit_4_2_0; mod high_bit_4_4_4_and_pn; diff --git a/src/row/arch/x86_sse41/tests/planar_8bit_and_nv.rs b/src/row/arch/x86_sse41/tests/planar_8bit_and_nv.rs index 8facc9b3..a898f520 100644 --- a/src/row/arch/x86_sse41/tests/planar_8bit_and_nv.rs +++ b/src/row/arch/x86_sse41/tests/planar_8bit_and_nv.rs @@ -676,11 +676,27 @@ fn check_yuv_444p_n_equivalence( let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_sse41 = std::vec![0u16; width * 3]; - scalar::yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::( + &y, + &u, + &v, + &mut rgb_scalar, + width, + matrix, + full_range, + ); + scalar::yuv_444p_n_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_sse41, width, matrix, full_range); - yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_sse41, width, matrix, full_range); + yuv_444p_n_to_rgb_row::(&y, &u, &v, &mut rgb_sse41, width, matrix, full_range); + yuv_444p_n_to_rgb_u16_row::(&y, &u, &v, &mut u16_sse41, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_sse41, @@ -766,11 +782,19 @@ fn check_yuv_444p16_equivalence(width: usize, matrix: ColorMatrix, full_range: b let mut u16_scalar = std::vec![0u16; width * 3]; let mut u16_sse41 = std::vec![0u16; width * 3]; - scalar::yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); - scalar::yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_scalar, width, matrix, full_range); + 
scalar::yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgb_u16_row::( + &y, + &u, + &v, + &mut u16_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgb_row(&y, &u, &v, &mut rgb_sse41, width, matrix, full_range); - yuv_444p16_to_rgb_u16_row(&y, &u, &v, &mut u16_sse41, width, matrix, full_range); + yuv_444p16_to_rgb_row::(&y, &u, &v, &mut rgb_sse41, width, matrix, full_range); + yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut u16_sse41, width, matrix, full_range); } assert_eq!( rgb_scalar, rgb_sse41, diff --git a/src/row/arch/x86_sse41/tests/yuva.rs b/src/row/arch/x86_sse41/tests/yuva.rs index 6276116b..c35c053b 100644 --- a/src/row/arch/x86_sse41/tests/yuva.rs +++ b/src/row/arch/x86_sse41/tests/yuva.rs @@ -20,7 +20,7 @@ fn check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -31,7 +31,7 @@ fn check_yuv444p_n_u8_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_with_alpha_src_row::( + yuv_444p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -214,7 +214,7 @@ fn check_yuv420p_n_u8_sse41_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -225,7 +225,7 @@ fn check_yuv420p_n_u8_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_with_alpha_src_row::( + yuv_420p_n_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -254,7 +254,7 @@ fn check_yuv420p16_u8_sse41_rgba_with_alpha_src_equivalence( let a_src = p16_plane(width, alpha_seed); let 
mut rgba_scalar = std::vec![0u8; width * 4]; let mut rgba_simd = std::vec![0u8; width * 4]; - scalar::yuv_420p16_to_rgba_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -265,7 +265,7 @@ fn check_yuv420p16_u8_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_with_alpha_src_row( + yuv_420p16_to_rgba_with_alpha_src_row::( &y, &u, &v, @@ -391,7 +391,7 @@ fn check_yuv444p_n_u16_sse41_rgba_equivalence( let v = planar_n_plane::(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( &y, &u, &v, @@ -401,7 +401,15 @@ fn check_yuv444p_n_u16_sse41_rgba_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p_n_to_rgba_u16_row::( + &y, + &u, + &v, + &mut rgba_simd, + width, + matrix, + full_range, + ); } assert_eq!( rgba_scalar, rgba_simd, @@ -420,9 +428,16 @@ fn check_pn_444_u16_sse41_rgba_equivalence( let uv = interleave_uv_sse41(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_to_rgba_u16_row::( + &y, + &uv, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -436,9 +451,17 @@ fn check_yuv444p16_u16_sse41_rgba_equivalence(width: usize, matrix: ColorMatrix, let v = p16_plane(width, 71); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_scalar, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_u16_row::( + 
&y, + &u, + &v, + &mut rgba_scalar, + width, + matrix, + full_range, + ); unsafe { - yuv_444p16_to_rgba_u16_row(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); + yuv_444p16_to_rgba_u16_row::(&y, &u, &v, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -457,9 +480,9 @@ fn check_p_n_444_16_u16_sse41_rgba_equivalence( let uv = interleave_uv_sse41(&u, &v); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_scalar, width, matrix, full_range); + scalar::p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_scalar, width, matrix, full_range); unsafe { - p_n_444_16_to_rgba_u16_row(&y, &uv, &mut rgba_simd, width, matrix, full_range); + p_n_444_16_to_rgba_u16_row::(&y, &uv, &mut rgba_simd, width, matrix, full_range); } assert_eq!( rgba_scalar, rgba_simd, @@ -567,7 +590,7 @@ fn check_yuv444p16_u16_sse41_rgba_with_alpha_src_equivalence( let a_src = p16_plane(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -578,7 +601,7 @@ fn check_yuv444p16_u16_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p16_to_rgba_u16_with_alpha_src_row( + yuv_444p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -670,7 +693,7 @@ fn check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -681,7 +704,7 @@ fn check_yuv444p_n_u16_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + yuv_444p_n_to_rgba_u16_with_alpha_src_row::( 
&y, &u, &v, @@ -822,7 +845,7 @@ fn check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence( let a_src = planar_n_plane::(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -833,7 +856,7 @@ fn check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -862,7 +885,7 @@ fn check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence( let a_src = p16_plane(width, alpha_seed); let mut rgba_scalar = std::vec![0u16; width * 4]; let mut rgba_simd = std::vec![0u16; width * 4]; - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, @@ -873,7 +896,7 @@ fn check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence( full_range, ); unsafe { - yuv_420p16_to_rgba_u16_with_alpha_src_row( + yuv_420p16_to_rgba_u16_with_alpha_src_row::( &y, &u, &v, diff --git a/src/row/arch/x86_sse41/yuv_planar_16bit.rs b/src/row/arch/x86_sse41/yuv_planar_16bit.rs index 6cd972fb..166402a6 100644 --- a/src/row/arch/x86_sse41/yuv_planar_16bit.rs +++ b/src/row/arch/x86_sse41/yuv_planar_16bit.rs @@ -15,7 +15,7 @@ use super::*; /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p16_to_rgb_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -26,7 +26,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -43,7 +43,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_row( /// Same as [`yuv_444p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p16_to_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -54,7 +54,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -74,7 +74,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -86,7 +86,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_row::( + yuv_444p16_to_rgb_or_rgba_row::( y, u, v, @@ -117,7 +117,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -158,12 +162,13 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row` first. 
+ let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_lo_vec = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let u_hi_vec = endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8); + let v_lo_vec = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); + let v_hi_vec = endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8); let u_lo_i16 = _mm_sub_epi16(u_lo_vec, bias16_v); let u_hi_i16 = _mm_sub_epi16(u_hi_vec, bias16_v); @@ -215,8 +220,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row> 8` to fit u8. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm_srli_epi16::<8>(_mm_loadu_si128(a_ptr.add(x).cast())); - let a_hi = _mm_srli_epi16::<8>(_mm_loadu_si128(a_ptr.add(x + 8).cast())); + let a_lo = + _mm_srli_epi16::<8>(endian::load_endian_u16x8::(a_ptr.add(x) as *const u8)); + let a_hi = _mm_srli_epi16::<8>(endian::load_endian_u16x8::( + a_ptr.add(x + 8) as *const u8 + )); _mm_packus_epi16(a_lo, a_hi) } else { alpha_u8 @@ -237,15 +245,17 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_row( + scalar::yuv_444p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_444p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -268,7 +278,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -279,7 +289,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgb_out, width, matrix, full_range, ); } @@ -295,7 +305,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_u16_row( /// Same as [`yuv_444p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -306,7 +316,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, None, rgba_out, width, matrix, full_range, ); } @@ -326,7 +336,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -338,7 +348,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p16_to_rgb_or_rgba_u16_row::( + yuv_444p16_to_rgb_or_rgba_u16_row::( y, u, v, @@ -369,7 +379,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u: &[u16], v: &[u16], @@ -411,10 +425,11 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row` first. 
+ let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let u_vec = endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8); + let v_vec = endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8); let u_i16 = _mm_sub_epi16(u_vec, bias16_v); let v_i16 = _mm_sub_epi16(v_vec, bias16_v); @@ -519,7 +534,9 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8 + ) } else { alpha_u16 }; @@ -539,15 +556,15 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p16_to_rgba_u16_row( + scalar::yuv_444p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p16_to_rgb_u16_row( + scalar::yuv_444p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -575,7 +592,7 @@ pub(crate) unsafe fn yuv_444p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -586,7 +603,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -601,7 +618,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_row( /// Same as [`yuv_420p16_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p16_to_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -612,7 +629,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -632,7 +649,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -644,7 +661,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_row::( + yuv_420p16_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -677,7 +694,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_with_alpha_src_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -720,10 +741,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row` first. + let y_low = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); + let y_high = endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8); + let u_vec = endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8); + let v_vec = endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8); // Center UV: subtract 32768 (wrapping i16 trick). let u_i16 = _mm_sub_epi16(u_vec, bias16_v); @@ -773,8 +795,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row` accepts a const literal // shift, so the intrinsic is well-formed. 
let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm_srli_epi16::<8>(_mm_loadu_si128(a_ptr.add(x).cast())); - let a_hi = _mm_srli_epi16::<8>(_mm_loadu_si128(a_ptr.add(x + 8).cast())); + let a_lo = + _mm_srli_epi16::<8>(endian::load_endian_u16x8::(a_ptr.add(x) as *const u8)); + let a_hi = _mm_srli_epi16::<8>(endian::load_endian_u16x8::( + a_ptr.add(x + 8) as *const u8 + )); _mm_packus_epi16(a_lo, a_hi) } else { alpha_u8 @@ -795,15 +820,17 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_row( + scalar::yuv_420p16_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_row(tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range); + scalar::yuv_420p16_to_rgb_row::( + tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, + ); } } } @@ -820,7 +847,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -830,7 +857,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -844,7 +871,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( /// Same as [`yuv_420p16_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -854,7 +881,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -873,7 +900,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -885,7 +912,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( + yuv_420p16_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -917,7 +944,11 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -957,13 +988,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row`; half-lane U/V via inline shuffle. + let y_vec = endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8); // Load 4 U and 4 V u16 values into the low 64 bits of each vector. 
let u_vec4 = _mm_loadl_epi64(u_half.as_ptr().add(x / 2).cast()); let v_vec4 = _mm_loadl_epi64(v_half.as_ptr().add(x / 2).cast()); + let u_vec4 = if BE { + _mm_shuffle_epi8(u_vec4, bswap_u16) + } else { + u_vec4 + }; + let v_vec4 = if BE { + _mm_shuffle_epi8(v_vec4, bswap_u16) + } else { + v_vec4 + }; // Center UV: subtract 32768 (wrapping i16 trick). let u_i16 = _mm_sub_epi16(u_vec4, bias16_v); @@ -1060,7 +1110,9 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( + a_src.as_ref().unwrap_unchecked().as_ptr().add(x) as *const u8 + ) } else { alpha_u16 }; @@ -1086,15 +1138,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p16_to_rgba_u16_row( + scalar::yuv_420p16_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p16_to_rgb_u16_row( + scalar::yuv_420p16_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/arch/x86_sse41/yuv_planar_high_bit.rs b/src/row/arch/x86_sse41/yuv_planar_high_bit.rs index ef0db9bb..a840a08c 100644 --- a/src/row/arch/x86_sse41/yuv_planar_high_bit.rs +++ b/src/row/arch/x86_sse41/yuv_planar_high_bit.rs @@ -2,7 +2,7 @@ use core::arch::x86_64::*; use super::*; -pub(crate) unsafe fn yuv_420p_n_to_rgb_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -13,7 +13,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -28,7 +28,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_row( /// Same as [`yuv_420p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -39,7 +39,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -59,7 +59,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -71,7 +71,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_row::( + yuv_420p_n_to_rgb_or_rgba_row::( y, u_half, v_half, @@ -110,6 +110,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -162,11 +163,24 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // 16 Y = two `u16x8` loads; 8 U + 8 V = one load each. Each // load is AND‑masked to the low 10 bits (see matching comment // in [`crate::row::scalar::yuv_420p_n_to_rgb_row`]). Valid - // 10‑bit samples ≤ 1023 pass through unchanged. - let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); - let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); - let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + // 10‑bit samples ≤ 1023 pass through unchanged. BE input is + // byte-swapped by `load_endian_u16x8::` before masking. 
+ let y_low_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = _mm_and_si128( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = _mm_and_si128( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -216,8 +230,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< // shift (not stable for `BITS - 8`); use `_mm_srl_epi16` // with a count vector built from `BITS - 8` instead. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x + 8).cast()), mask_v); + let a_lo = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); let a_shr = _mm_cvtsi32_si128((BITS - 8) as i32); let a_lo_shifted = _mm_srl_epi16(a_lo, a_shr); let a_hi_shifted = _mm_srl_epi16(a_hi, a_shr); @@ -242,15 +262,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_row::( + scalar::yuv_420p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_row::( + scalar::yuv_420p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -280,7 +300,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_row< /// `v_half.len() >= width / 2`, `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -290,7 +310,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } @@ -305,7 +325,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( /// Same as [`yuv_420p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -315,7 +335,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } @@ -335,7 +355,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -347,7 +367,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( + yuv_420p_n_to_rgb_or_rgba_u16_row::( y, u_half, v_half, @@ -384,6 +404,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u_half: &[u16], @@ -438,11 +459,24 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // (32768 for 10→10 full range) would let an out‑of‑range // sample push a `coeff * v_d` product past i16 range, // triggering information loss in the subsequent - // `_mm_packs_epi32` narrow step inside `chroma_i16x8`. - let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); - let u_vec = _mm_and_si128(_mm_loadu_si128(u_half.as_ptr().add(x / 2).cast()), mask_v); - let v_vec = _mm_and_si128(_mm_loadu_si128(v_half.as_ptr().add(x / 2).cast()), mask_v); + // `_mm_packs_epi32` narrow step inside `chroma_i16x8`. BE + // input is byte-swapped by `load_endian_u16x8::` first. 
+ let y_low_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_vec = _mm_and_si128( + endian::load_endian_u16x8::(u_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); + let v_vec = _mm_and_si128( + endian::load_endian_u16x8::(v_half.as_ptr().add(x / 2) as *const u8), + mask_v, + ); let u_i16 = _mm_sub_epi16(u_vec, bias_v); let v_i16 = _mm_sub_epi16(v_vec, bias_v); @@ -487,8 +521,14 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< // No depth conversion — both source alpha and u16 output are // at the same native bit depth (BITS), so just mask. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x).cast()), mask_v); - let hi = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x + 8).cast()), mask_v); + let lo = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -512,15 +552,15 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_420p_n_to_rgba_u16_row::( + scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_420p_n_to_rgb_u16_row::( + scalar::yuv_420p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -551,7 +591,7 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< /// `rgb_out.len() >= 3 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -562,7 +602,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -580,7 +620,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_row( /// Same as [`yuv_444p_n_to_rgb_row`] but `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -591,7 +631,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -611,7 +651,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -623,7 +663,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_row::( + yuv_444p_n_to_rgb_or_rgba_row::( y, u, v, @@ -659,6 +699,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -705,13 +746,32 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< let mut x = 0usize; while x + 16 <= width { // 16 Y + 16 U + 16 V per iter. Full-width chroma load (two - // u16x8 each) — no horizontal duplication needed. - let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); - let u_lo_vec = _mm_and_si128(_mm_loadu_si128(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm_and_si128(_mm_loadu_si128(u.as_ptr().add(x + 8).cast()), mask_v); - let v_lo_vec = _mm_and_si128(_mm_loadu_si128(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm_and_si128(_mm_loadu_si128(v.as_ptr().add(x + 8).cast()), mask_v); + // u16x8 each) — no horizontal duplication needed. BE input is + // byte-swapped by `load_endian_u16x8::` first. 
+ let y_low_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_vec = _mm_and_si128( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm_and_si128( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_vec = _mm_and_si128( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm_and_si128( + endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let u_lo_i16 = _mm_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm_sub_epi16(u_hi_vec, bias_v); @@ -764,8 +824,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< // SAFETY (const-checked): ALPHA_SRC = true implies the // wrapper passed Some(_), validated by debug_assert. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let a_lo = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x).cast()), mask_v); - let a_hi = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x + 8).cast()), mask_v); + let a_lo = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let a_hi = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); // Mask before shifting to harden against over-range source // alpha (e.g. 1024 at BITS=10), matching scalar. SSE4.1 // `_mm_srli_epi16::` requires a literal const generic @@ -795,15 +861,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_row::( + scalar::yuv_444p_n_to_rgba_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_row::( + scalar::yuv_444p_n_to_rgb_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } @@ -822,7 +888,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_row< /// Same as [`yuv_444p_n_to_rgb_row`] but `rgb_out: &mut [u16]`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -833,7 +899,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgb_out, width, matrix, full_range, None, ); } @@ -851,7 +917,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_u16_row( /// Same as [`yuv_444p_n_to_rgb_u16_row`] plus `rgba_out.len() >= 4 * width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -862,7 +928,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( ) { // SAFETY: caller obligations forwarded to the shared impl. 
unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, rgba_out, width, matrix, full_range, None, ); } @@ -883,7 +949,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_row( #[inline] #[target_feature(enable = "sse4.1")] #[allow(clippy::too_many_arguments)] -pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( +pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( y: &[u16], u: &[u16], v: &[u16], @@ -895,7 +961,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgba_u16_with_alpha_src_row( ) { // SAFETY: caller obligations forwarded to the shared impl. unsafe { - yuv_444p_n_to_rgb_or_rgba_u16_row::( + yuv_444p_n_to_rgb_or_rgba_u16_row::( y, u, v, @@ -934,6 +1000,7 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< const BITS: u32, const ALPHA: bool, const ALPHA_SRC: bool, + const BE: bool, >( y: &[u16], u: &[u16], @@ -985,12 +1052,31 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< let mut x = 0usize; while x + 16 <= width { - let y_low_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x).cast()), mask_v); - let y_high_i16 = _mm_and_si128(_mm_loadu_si128(y.as_ptr().add(x + 8).cast()), mask_v); - let u_lo_vec = _mm_and_si128(_mm_loadu_si128(u.as_ptr().add(x).cast()), mask_v); - let u_hi_vec = _mm_and_si128(_mm_loadu_si128(u.as_ptr().add(x + 8).cast()), mask_v); - let v_lo_vec = _mm_and_si128(_mm_loadu_si128(v.as_ptr().add(x).cast()), mask_v); - let v_hi_vec = _mm_and_si128(_mm_loadu_si128(v.as_ptr().add(x + 8).cast()), mask_v); + // BE input is byte-swapped by `load_endian_u16x8::` first. 
+ let y_low_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x) as *const u8), + mask_v, + ); + let y_high_i16 = _mm_and_si128( + endian::load_endian_u16x8::(y.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let u_lo_vec = _mm_and_si128( + endian::load_endian_u16x8::(u.as_ptr().add(x) as *const u8), + mask_v, + ); + let u_hi_vec = _mm_and_si128( + endian::load_endian_u16x8::(u.as_ptr().add(x + 8) as *const u8), + mask_v, + ); + let v_lo_vec = _mm_and_si128( + endian::load_endian_u16x8::(v.as_ptr().add(x) as *const u8), + mask_v, + ); + let v_hi_vec = _mm_and_si128( + endian::load_endian_u16x8::(v.as_ptr().add(x + 8) as *const u8), + mask_v, + ); let u_lo_i16 = _mm_sub_epi16(u_lo_vec, bias_v); let u_hi_i16 = _mm_sub_epi16(u_hi_vec, bias_v); @@ -1040,8 +1126,14 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< // at the same native bit depth (BITS), so just AND-mask any // over-range bits to match the scalar reference. let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); - let lo = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x).cast()), mask_v); - let hi = _mm_and_si128(_mm_loadu_si128(a_ptr.add(x + 8).cast()), mask_v); + let lo = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x) as *const u8), + mask_v, + ); + let hi = _mm_and_si128( + endian::load_endian_u16x8::(a_ptr.add(x + 8) as *const u8), + mask_v, + ); (lo, hi) } else { (alpha_u16, alpha_u16) @@ -1065,15 +1157,15 @@ pub(crate) unsafe fn yuv_444p_n_to_rgb_or_rgba_u16_row< if ALPHA_SRC { // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::( tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, ); } else if ALPHA { - scalar::yuv_444p_n_to_rgba_u16_row::( + scalar::yuv_444p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } else { - scalar::yuv_444p_n_to_rgb_u16_row::( + scalar::yuv_444p_n_to_rgb_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); } diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 103c603e..8088e8c9 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -198,7 +198,9 @@ pub fn yuv420p10_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } /// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** @@ -284,7 +286,9 @@ pub fn yuv420p10_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } /// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** @@ -368,5 +372,7 @@ pub fn yuv420p10_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 2e0dec5c..bf84f8ab 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -174,7 +174,9 @@ pub fn yuv420p12_to_rgb_u16_row( } } - 
scalar::yuv_420p_n_to_rgb_u16_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } /// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** @@ -260,7 +262,9 @@ pub fn yuv420p12_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } /// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** @@ -344,5 +348,7 @@ pub fn yuv420p12_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index 61f6e59a..e98c61ac 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -163,7 +163,9 @@ pub fn yuv420p14_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } /// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** @@ -249,7 +251,9 @@ pub fn yuv420p14_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } /// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** @@ -333,5 +337,7 @@ pub fn yuv420p14_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<14, 
false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index 75690ee5..1f4dfa9e 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -292,5 +292,7 @@ pub fn yuv420p16_to_rgba_u16_row( } } - scalar::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p16_to_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index d4867753..3d770678 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -185,7 +185,9 @@ pub fn yuv420p9_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgb_u16_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } // ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- @@ -278,7 +280,9 @@ pub fn yuv420p9_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } /// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** @@ -362,5 +366,7 @@ pub fn yuv420p9_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); + scalar::yuv_420p_n_to_rgba_u16_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } diff --git a/src/row/dispatch/yuva/sub_4_4_4.rs b/src/row/dispatch/yuva/sub_4_4_4.rs index 4eb17490..8edce6fc 100644 --- a/src/row/dispatch/yuva/sub_4_4_4.rs +++ b/src/row/dispatch/yuva/sub_4_4_4.rs @@ -924,7 +924,9 @@ pub fn yuva444p16_to_rgba_row( } } - 
scalar::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( + y, u, v, a, rgba_out, width, matrix, full_range, + ); } /// Converts one row of **16-bit** YUVA 4:4:4 to **native-depth `u16`** diff --git a/src/row/scalar/yuv_planar_16bit.rs b/src/row/scalar/yuv_planar_16bit.rs index c49db3b2..fa478d24 100644 --- a/src/row/scalar/yuv_planar_16bit.rs +++ b/src/row/scalar/yuv_planar_16bit.rs @@ -127,7 +127,11 @@ pub(crate) fn yuv_420p16_to_rgba_with_alpha_src_row( /// u16 is in range. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_420p16_to_rgb_or_rgba_row( +pub(crate) fn yuv_420p16_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -383,7 +387,9 @@ pub(crate) fn yuv_444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgb_out, width, matrix, full_range); + yuv_444p16_to_rgb_or_rgba_row::( + y, u, v, None, rgb_out, width, matrix, full_range, + ); } /// YUV 4:4:4 planar **16‑bit** → packed **8‑bit** **RGBA**. Same @@ -402,7 +408,9 @@ pub(crate) fn yuv_444p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, ) { - yuv_444p16_to_rgb_or_rgba_row::(y, u, v, None, rgba_out, width, matrix, full_range); + yuv_444p16_to_rgb_or_rgba_row::( + y, u, v, None, rgba_out, width, matrix, full_range, + ); } /// YUVA 4:4:4 16‑bit → packed **8‑bit** **RGBA**. 
Same numerical diff --git a/src/row/scalar/yuv_planar_high_bit.rs b/src/row/scalar/yuv_planar_high_bit.rs index 3816d91d..ecb10968 100644 --- a/src/row/scalar/yuv_planar_high_bit.rs +++ b/src/row/scalar/yuv_planar_high_bit.rs @@ -184,8 +184,14 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_row< let mut x = 0; while x < width { let c_idx = x / 2; - let u_d = q15_scale((load_u16::(u_half[c_idx]) & mask) as i32 - bias, c_scale); - let v_d = q15_scale((load_u16::(v_half[c_idx]) & mask) as i32 - bias, c_scale); + let u_d = q15_scale( + (load_u16::(u_half[c_idx]) & mask) as i32 - bias, + c_scale, + ); + let v_d = q15_scale( + (load_u16::(v_half[c_idx]) & mask) as i32 - bias, + c_scale, + ); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); @@ -401,8 +407,14 @@ pub(crate) fn yuv_420p_n_to_rgb_or_rgba_u16_row< let mut x = 0; while x < width { let c_idx = x / 2; - let u_d = q15_scale((load_u16::(u_half[c_idx]) & mask) as i32 - bias, c_scale); - let v_d = q15_scale((load_u16::(v_half[c_idx]) & mask) as i32 - bias, c_scale); + let u_d = q15_scale( + (load_u16::(u_half[c_idx]) & mask) as i32 - bias, + c_scale, + ); + let v_d = q15_scale( + (load_u16::(v_half[c_idx]) & mask) as i32 - bias, + c_scale, + ); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); let g_chroma = q15_chroma(coeffs.g_u(), u_d, coeffs.g_v(), v_d); From 3683eeb6a57396ed5c7cd2a9fbdcb01ad648eed8 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Fri, 8 May 2026 02:12:48 +1200 Subject: [PATCH 3/8] fix(be-yuv-hb): make scalar BE conversion target-endian aware MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit YUV-HB scalar reads route through a centralized helper `load_u16` in `src/row/scalar/mod.rs`. 
The original implementation used the naive `if BE { v.swap_bytes() } else { v }` pattern which is wrong on big-endian hosts (s390x): it unconditionally swaps when BE=true regardless of host endianness, diverging from the SIMD `load_endian_u16x*::` helpers which are target-endian aware (a swap is needed only when source byte order differs from host CPU's native byte order). Replaced with `if BE { u16::from_be(v) } else { u16::from_le(v) }`. `u16::from_be`/`from_le` each emit a `bswap` only when the source byte order differs from the host — exactly matching the SIMD helper semantics. All 9/10/12/14/16-bit YUV planar + P010/012/016/410/412/ 416 kernels go through this single helper, so this fix corrects every YUV-HB BE scalar path crate-wide in one commit. Verified: - cargo test --target aarch64-apple-darwin --lib: 2168 passed - cargo build --target x86_64-apple-darwin --tests: 0 warnings - cargo build --target wasm32-unknown-unknown --tests (RUSTFLAGS=-C target-feature=+simd128): clean - cargo build --no-default-features: clean - cargo fmt --check: clean - cargo clippy --all-targets --all-features -D warnings: clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/scalar/mod.rs | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/row/scalar/mod.rs b/src/row/scalar/mod.rs index 4212e48b..3c4d8635 100644 --- a/src/row/scalar/mod.rs +++ b/src/row/scalar/mod.rs @@ -181,15 +181,25 @@ pub(super) fn clamp_u8(v: i32) -> u8 { v.clamp(0, 255) as u8 } -/// Byte-swap a `u16` sample when `BE = true`; identity when `BE = false`. +/// Normalize a `u16` sample (just read host-native from memory) to the +/// host-native interpretation of the source byte order indicated by `BE`. +/// `BE = false` → little-endian source; `BE = true` → big-endian source. +/// The `if BE` branch is dead-code-eliminated per monomorphization, so +/// the matching-endian path is a zero-overhead no-op. 
/// -/// Used by BE-aware scalar kernels to normalize big-endian `u16` plane -/// elements to host-native order at load time. The `if BE` branch is -/// dead-code-eliminated by the compiler for each monomorphization, so -/// the LE path (`BE = false`) is a zero-overhead no-op. +/// **Target-endian aware** — matches the SIMD `load_endian_u16x*::` +/// helpers' semantics: `u16::from_be` / `u16::from_le` each emit a +/// `bswap` only when the source byte order differs from the host CPU's +/// native order. On a BE host the `BE = true` branch is a plain pass- +/// through (no swap) and the `BE = false` branch swaps; on an LE host +/// the polarity reverses. This is the strict-superset-of-bugs +/// alternative to a naive `if BE { v.swap_bytes() }` pattern, which +/// would corrupt rows on s390x / other BE hosts. See +/// `fix(be-tier10b): make scalar BE conversion target-endian aware` +/// for the codex finding that motivated this contract crate-wide. #[cfg_attr(not(tarpaulin), inline(always))] pub(super) const fn load_u16(v: u16) -> u16 { - if BE { v.swap_bytes() } else { v } + if BE { u16::from_be(v) } else { u16::from_le(v) } } /// `(sample * scale_q15 + RND) >> 15`. With input masked to BITS, From cbedaf103383e40d21247896ae786cd52f16cf54 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 13:33:13 +1200 Subject: [PATCH 4/8] fix(be-yuv-hb): NEON P-format chroma deinterleave host-endian gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review on PR #89 flagged two high-severity findings in the NEON high-bit-packed semi-planar (P-format) chroma path: src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs:20-25 BE UV deinterleave swaps on wire endian instead of host mismatch. src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs:30-35 Same endian inversion in the 4:4:4 P-format chroma path. 
`vld2q_u16` materializes UV lanes in the **host-native** u16 byte order regardless of the wire encoding, so the per-lane byte-swap must trigger on `BE != HOST_NATIVE_BE`, not on `BE` alone (which assumes an LE host). On a big-endian AArch64 target (`aarch64_be-*`) the previous gate inverted both quadrants: * BE input already matches host order → got byte-reversed. * LE input differs from host order → was left unswapped. The Y path in these files already routes through the target-aware `endian::load_endian_u16x8::` helper, so the bug is isolated to the interleaved UV halves of P010/P012/P016 (4:2:0) and P410/ P412/P416 (4:4:4). Mirrors PR #82 (`9c7d533`), #85 (`9e678b0`), #87 (`b4e0ac9`), and PR #88 (`9ea3e43`) target-endian-gate fixes. The two NEON `deinterleave_endian` helpers now delegate per-lane to the existing shared `bswap_u16x8_if_be` helper in `src/row/arch/neon/mod.rs`, which already encodes the `BE != HOST_NATIVE_BE` truth table. For consistency across backends and to avoid future regression on hypothetical BE-x86 / BE-wasm targets, the same gate change is applied to the x86 SSE4.1 / AVX2 / AVX-512 and wasm_simd128 P-format `byteswap_u16x{8,16,32}` helpers (no behavior change today since `cfg!(target_endian = "big")` is `false` on these targets, the gate folds identically). Two additional non-NEON sites carrying the same bug class were also fixed: * x86_sse41 / x86_avx2 4:2:0: `split_mask` shuffle index (`if BE { swap-fold } else { plain }`). * x86_avx512 4:2:0: `uv_lane_mask` per-lane shuffle index. All gated to `BE != HOST_NATIVE_BE`. 
Affected files (per backend, per format): * NEON (the codex-named bugs): - src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs - src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs * x86 SSE4.1: - src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs - src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs * x86 AVX2: - src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs - src/row/arch/x86_avx2/subsampled_high_bit_pn_4_4_4.rs * x86 AVX-512: - src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs - src/row/arch/x86_avx512/subsampled_high_bit_pn_4_4_4.rs * wasm simd128: - src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs - src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs Test coverage: * Rewrote `src/row/arch/neon/tests/be_parity.rs` to construct the LE / BE input fixtures from raw bytes via `to_le_bytes` / `to_be_bytes` (then `from_ne_bytes` to reinterpret as host-native u16). Mirrors PR #82 / #85 / #87 / #88's host-independent fixture pattern. The earlier `swap_bytes` pattern was vacuous on BE hosts (both kernels produced equal-but-wrong outputs and the assert passed without exercising the BE-host decode path). 
* Added 3 new BE parity regression tests covering the u16-output P-format kernels — the additional `vld2q_u16 + deinterleave_endian` sites in 4:2:0 (line 366) and 4:4:4 (line 360-363, 716): - `neon_p012_be_parity_u16` (Pn 4:2:0 → u16 RGB) - `neon_p412_be_parity_u16` (Pn 4:4:4 → u16 RGB) - `neon_p016_be_parity_u16` (P016 4:2:0 → u16 RGB) Verified: * cargo test --target aarch64-apple-darwin --lib (2331 passed) * cargo build --target x86_64-apple-darwin --tests (0 warnings) * RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests * cargo build --no-default-features * cargo fmt --check * cargo clippy --all-targets --all-features -- -D warnings * cargo check --target s390x-unknown-linux-gnu --lib (BE-host smoke) Co-Authored-By: Claude Opus 4.7 (1M context) --- .../arch/neon/subsampled_high_bit_pn_4_2_0.rs | 40 ++- .../arch/neon/subsampled_high_bit_pn_4_4_4.rs | 29 +- src/row/arch/neon/tests/be_parity.rs | 301 ++++++++++++++---- .../subsampled_high_bit_pn_4_2_0.rs | 15 +- .../subsampled_high_bit_pn_4_4_4.rs | 15 +- .../x86_avx2/subsampled_high_bit_pn_4_2_0.rs | 22 +- .../x86_avx2/subsampled_high_bit_pn_4_4_4.rs | 15 +- .../subsampled_high_bit_pn_4_2_0.rs | 23 +- .../subsampled_high_bit_pn_4_4_4.rs | 15 +- .../x86_sse41/subsampled_high_bit_pn_4_2_0.rs | 26 +- .../x86_sse41/subsampled_high_bit_pn_4_4_4.rs | 15 +- 11 files changed, 377 insertions(+), 139 deletions(-) diff --git a/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs index 7381c514..f97702ab 100644 --- a/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/neon/subsampled_high_bit_pn_4_2_0.rs @@ -4,24 +4,32 @@ use crate::{ColorMatrix, row::scalar}; use super::*; -/// Byte-swap every u16 lane in `v` (BE ↔ LE conversion in-register). -/// -/// Equivalent to `vrev16q_u8` on the reinterpreted byte view. 
Used -/// after `vld2q_u16` to apply per-lane byte-swapping that -/// `load_endian_u16x8` cannot perform for interleaved loads. -#[inline(always)] -unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { - unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } -} - -/// Apply BE byte-swap to a `uint16x8x2_t` pair (each lane individually). -/// When `BE = false` this is a no-op and compiles away entirely. +/// Apply per-lane byte-swap to a `uint16x8x2_t` pair when the source +/// (wire) endian differs from the host's native u16 byte order. +/// +/// `vld2q_u16` materializes lanes in the **host-native** u16 byte order +/// regardless of the wire encoding, so the swap must trigger on +/// `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88 fixes). +/// Truth table: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// `BE` and [`HOST_NATIVE_BE`](super::HOST_NATIVE_BE) are both +/// compile-time constants, so the gate folds and the unused branch is +/// eliminated. Reuses the shared [`bswap_u16x8_if_be`](super::bswap_u16x8_if_be) +/// helper so both lanes go through the same target-aware path. #[inline(always)] unsafe fn deinterleave_endian(pair: uint16x8x2_t) -> uint16x8x2_t { - if BE { - unsafe { uint16x8x2_t(byteswap_u16x8(pair.0), byteswap_u16x8(pair.1)) } - } else { - pair + unsafe { + uint16x8x2_t( + bswap_u16x8_if_be::(pair.0), + bswap_u16x8_if_be::(pair.1), + ) } } diff --git a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs index 2367e9ef..7a4aa916 100644 --- a/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/neon/subsampled_high_bit_pn_4_4_4.rs @@ -14,24 +14,21 @@ use super::*; // register, like `nv24_to_rgb_row`. 
Each iteration consumes 16 Y // pixels and 32 UV `u16` elements (= 16 interleaved U/V pairs). -/// Byte-swap every u16 lane in `v` (BE ↔ LE conversion in-register). -/// -/// Equivalent to `vrev16q_u8` on the reinterpreted byte view. Used -/// after `vld2q_u16` to apply per-lane byte-swapping that -/// `load_endian_u16x8` cannot perform for interleaved loads. -#[inline(always)] -unsafe fn byteswap_u16x8(v: uint16x8_t) -> uint16x8_t { - unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } -} - -/// Apply BE byte-swap to a `uint16x8x2_t` pair (each lane individually). -/// When `BE = false` this is a no-op and compiles away entirely. +/// Apply per-lane byte-swap to a `uint16x8x2_t` pair when the source +/// (wire) endian differs from the host's native u16 byte order. +/// +/// `vld2q_u16` materializes lanes in the **host-native** u16 byte order +/// regardless of the wire encoding, so the swap must trigger on +/// `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88 fixes). +/// See [`bswap_u16x8_if_be`](super::bswap_u16x8_if_be) for the truth +/// table; this wrapper just applies it to both UV halves. #[inline(always)] unsafe fn deinterleave_endian(pair: uint16x8x2_t) -> uint16x8x2_t { - if BE { - unsafe { uint16x8x2_t(byteswap_u16x8(pair.0), byteswap_u16x8(pair.1)) } - } else { - pair + unsafe { + uint16x8x2_t( + bswap_u16x8_if_be::(pair.0), + bswap_u16x8_if_be::(pair.1), + ) } } diff --git a/src/row/arch/neon/tests/be_parity.rs b/src/row/arch/neon/tests/be_parity.rs index f774fac8..6bffbcd3 100644 --- a/src/row/arch/neon/tests/be_parity.rs +++ b/src/row/arch/neon/tests/be_parity.rs @@ -1,12 +1,17 @@ //! BE parity tests for NEON high-bit YUV / P-format kernels. //! -//! Each test takes a randomized LE input buffer, byte-swaps every u16 -//! element to produce a BE-encoded buffer, then asserts that -//! `kernel::(swapped_input)` produces byte-identical output -//! to `kernel::(original_input)`. This is the formal -//! 
parity contract for the BE-aware kernels: BE input is a swapped -//! representation of the same logical pixel data, so the output must -//! match. +//! Each test constructs the LE and BE input buffers from raw bytes via +//! `to_le_bytes` / `to_be_bytes` (then `from_ne_bytes` to reinterpret +//! as host-native `u16`), so the byte-level encoding is host-independent: +//! on every host (LE or BE), `*_le` carries the intended u16 values as +//! LE-encoded bytes and `*_be` carries the same intended values as +//! BE-encoded bytes. Both kernels therefore decode to the intended +//! host-native u16 samples and must produce byte-identical output. +//! +//! The earlier `swap_bytes` pattern was vacuous on BE hosts (both +//! `kernel::` and `kernel::` produced equal-but-wrong +//! outputs and the assert passed without exercising the BE-host +//! decode path). Mirrors PR #82 / #85 / #87 / #88 fixture rewrites. use crate::row::neon_available; @@ -15,8 +20,24 @@ use super::{ planar_n_plane, }; -fn byteswap_u16_buf(buf: &[u16]) -> std::vec::Vec { - buf.iter().map(|x| x.swap_bytes()).collect() +/// Reinterpret an intended-u16 buffer as host-native `u16` carrying +/// LE-encoded bytes. On LE hosts this is the identity; on BE hosts it +/// stores each value with its bytes swapped vs. host-native order. +fn as_le_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf + .iter() + .map(|v| u16::from_ne_bytes(v.to_le_bytes())) + .collect() +} + +/// Reinterpret an intended-u16 buffer as host-native `u16` carrying +/// BE-encoded bytes. On BE hosts this is the identity; on LE hosts it +/// stores each value with its bytes swapped vs. host-native order. 
+fn as_be_u16_buf(buf: &[u16]) -> std::vec::Vec { + buf + .iter() + .map(|v| u16::from_ne_bytes(v.to_be_bytes())) + .collect() } // ---- yuv_420p_n (planar 4:2:0 high-bit) ----------------------------- @@ -27,19 +48,30 @@ fn neon_yuv_420p10_be_parity_u8() { return; } let width = 32; - let y = planar_n_plane::<10>(width, 13); - let u = planar_n_plane::<10>(width / 2, 17); - let v = planar_n_plane::<10>(width / 2, 19); - let y_be = byteswap_u16_buf(&y); - let u_be = byteswap_u16_buf(&u); - let v_be = byteswap_u16_buf(&v); + let y_intended = planar_n_plane::<10>(width, 13); + let u_intended = planar_n_plane::<10>(width / 2, 17); + let v_intended = planar_n_plane::<10>(width / 2, 19); + let y_le = as_le_u16_buf(&y_intended); + let u_le = as_le_u16_buf(&u_intended); + let v_le = as_le_u16_buf(&v_intended); + let y_be = as_be_u16_buf(&y_intended); + let u_be = as_be_u16_buf(&u_intended); + let v_be = as_be_u16_buf(&v_intended); for matrix in [ColorMatrix::Bt709, ColorMatrix::Bt2020Ncl] { for full_range in [true, false] { let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - yuv_420p_n_to_rgb_row::<10, false>(&y, &u, &v, &mut out_le, width, matrix, full_range); + yuv_420p_n_to_rgb_row::<10, false>( + &y_le, + &u_le, + &v_le, + &mut out_le, + width, + matrix, + full_range, + ); yuv_420p_n_to_rgb_row::<10, true>( &y_be, &u_be, @@ -61,20 +93,23 @@ fn neon_yuv_420p10_be_parity_u16() { return; } let width = 32; - let y = planar_n_plane::<10>(width, 23); - let u = planar_n_plane::<10>(width / 2, 29); - let v = planar_n_plane::<10>(width / 2, 31); - let y_be = byteswap_u16_buf(&y); - let u_be = byteswap_u16_buf(&u); - let v_be = byteswap_u16_buf(&v); + let y_intended = planar_n_plane::<10>(width, 23); + let u_intended = planar_n_plane::<10>(width / 2, 29); + let v_intended = planar_n_plane::<10>(width / 2, 31); + let y_le = as_le_u16_buf(&y_intended); + let u_le = as_le_u16_buf(&u_intended); + let v_le = as_le_u16_buf(&v_intended); + 
let y_be = as_be_u16_buf(&y_intended); + let u_be = as_be_u16_buf(&u_intended); + let v_be = as_be_u16_buf(&v_intended); let mut out_le = std::vec![0u16; width * 3]; let mut out_be = std::vec![0u16; width * 3]; unsafe { yuv_420p_n_to_rgb_u16_row::<10, false>( - &y, - &u, - &v, + &y_le, + &u_le, + &v_le, &mut out_le, width, ColorMatrix::Bt709, @@ -101,17 +136,28 @@ fn neon_yuv_444p12_be_parity_u8() { return; } let width = 32; - let y = planar_n_plane::<12>(width, 41); - let u = planar_n_plane::<12>(width, 43); - let v = planar_n_plane::<12>(width, 47); - let y_be = byteswap_u16_buf(&y); - let u_be = byteswap_u16_buf(&u); - let v_be = byteswap_u16_buf(&v); + let y_intended = planar_n_plane::<12>(width, 41); + let u_intended = planar_n_plane::<12>(width, 43); + let v_intended = planar_n_plane::<12>(width, 47); + let y_le = as_le_u16_buf(&y_intended); + let u_le = as_le_u16_buf(&u_intended); + let v_le = as_le_u16_buf(&v_intended); + let y_be = as_be_u16_buf(&y_intended); + let u_be = as_be_u16_buf(&u_intended); + let v_be = as_be_u16_buf(&v_intended); let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - yuv_444p_n_to_rgb_row::<12, false>(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p_n_to_rgb_row::<12, false>( + &y_le, + &u_le, + &v_le, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); yuv_444p_n_to_rgb_row::<12, true>( &y_be, &u_be, @@ -133,17 +179,28 @@ fn neon_yuv_420p16_be_parity_u8() { return; } let width = 32; - let y = p16_plane_neon(width, 53); - let u = p16_plane_neon(width / 2, 59); - let v = p16_plane_neon(width / 2, 61); - let y_be = byteswap_u16_buf(&y); - let u_be = byteswap_u16_buf(&u); - let v_be = byteswap_u16_buf(&v); + let y_intended = p16_plane_neon(width, 53); + let u_intended = p16_plane_neon(width / 2, 59); + let v_intended = p16_plane_neon(width / 2, 61); + let y_le = as_le_u16_buf(&y_intended); + let u_le = as_le_u16_buf(&u_intended); + let v_le = 
as_le_u16_buf(&v_intended); + let y_be = as_be_u16_buf(&y_intended); + let u_be = as_be_u16_buf(&u_intended); + let v_be = as_be_u16_buf(&v_intended); let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - yuv_420p16_to_rgb_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_420p16_to_rgb_row::( + &y_le, + &u_le, + &v_le, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); yuv_420p16_to_rgb_row::( &y_be, &u_be, @@ -163,17 +220,28 @@ fn neon_yuv_444p16_be_parity_u16() { return; } let width = 32; - let y = p16_plane_neon(width, 67); - let u = p16_plane_neon(width, 71); - let v = p16_plane_neon(width, 73); - let y_be = byteswap_u16_buf(&y); - let u_be = byteswap_u16_buf(&u); - let v_be = byteswap_u16_buf(&v); + let y_intended = p16_plane_neon(width, 67); + let u_intended = p16_plane_neon(width, 71); + let v_intended = p16_plane_neon(width, 73); + let y_le = as_le_u16_buf(&y_intended); + let u_le = as_le_u16_buf(&u_intended); + let v_le = as_le_u16_buf(&v_intended); + let y_be = as_be_u16_buf(&y_intended); + let u_be = as_be_u16_buf(&u_intended); + let v_be = as_be_u16_buf(&v_intended); let mut out_le = std::vec![0u16; width * 3]; let mut out_be = std::vec![0u16; width * 3]; unsafe { - yuv_444p16_to_rgb_u16_row::(&y, &u, &v, &mut out_le, width, ColorMatrix::Bt709, true); + yuv_444p16_to_rgb_u16_row::( + &y_le, + &u_le, + &v_le, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); yuv_444p16_to_rgb_u16_row::( &y_be, &u_be, @@ -188,6 +256,12 @@ fn neon_yuv_444p16_be_parity_u16() { } // ---- p_n / p_n_444 (semi-planar high-bit-packed) -------------------- +// +// The 4:2:0 (`p_n_to_*`) and 4:4:4 (`p_n_444_to_*`) NEON kernels deinterleave +// UV via `vld2q_u16`, which materializes lanes in host-native order. Their +// per-lane byte-swap therefore must trigger on `BE != HOST_NATIVE_BE`, not +// just `BE`. 
These regression tests catch the BE-host miscompile fixed in +// this patch (codex review on PR #89). #[test] fn neon_p010_be_parity_u8() { @@ -195,83 +269,178 @@ fn neon_p010_be_parity_u8() { return; } let width = 32; - let y = p_n_packed_plane::<10>(width, 79); + let y_intended = p_n_packed_plane::<10>(width, 79); let u_half = p_n_packed_plane::<10>(width / 2, 83); let v_half = p_n_packed_plane::<10>(width / 2, 89); - let uv_half = p010_uv_interleave(&u_half, &v_half); - let y_be = byteswap_u16_buf(&y); - let uv_be = byteswap_u16_buf(&uv_half); + let uv_intended = p010_uv_interleave(&u_half, &v_half); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - p_n_to_rgb_row::<10, false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_row::<10, false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); p_n_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); } assert_eq!(out_le, out_be); } +#[test] +fn neon_p012_be_parity_u16() { + if !neon_available() { + return; + } + // Native-depth u16 output exercises `p_n_to_rgb_or_rgba_u16_row`, + // the second `vld2q_u16` + `deinterleave_endian::` site in + // `subsampled_high_bit_pn_4_2_0.rs` (line 366 in the codex finding). 
+ let width = 32; + let y_intended = p_n_packed_plane::<12>(width, 149); + let u_half = p_n_packed_plane::<12>(width / 2, 151); + let v_half = p_n_packed_plane::<12>(width / 2, 157); + let uv_intended = p010_uv_interleave(&u_half, &v_half); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_to_rgb_u16_row::<12, false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_to_rgb_u16_row::<12, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + #[test] fn neon_p410_be_parity_u8() { if !neon_available() { return; } let width = 32; - let y = p_n_packed_plane::<10>(width, 97); + let y_intended = p_n_packed_plane::<10>(width, 97); let u_full = high_bit_plane::<10>(width, 101); let v_full = high_bit_plane::<10>(width, 103); - let uv_full = interleave_uv(&u_full, &v_full); - let y_be = byteswap_u16_buf(&y); - let uv_be = byteswap_u16_buf(&uv_full); + let uv_intended = interleave_uv(&u_full, &v_full); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - p_n_444_to_rgb_row::<10, false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_to_rgb_row::<10, false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); p_n_444_to_rgb_row::<10, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); } assert_eq!(out_le, out_be); } +#[test] +fn neon_p412_be_parity_u16() { + if !neon_available() { + return; + } + // Native-depth u16 output exercises `p_n_444_to_rgb_or_rgba_u16_row`, + // the second `vld2q_u16` + `deinterleave_endian::` site 
 in + `subsampled_high_bit_pn_4_4_4.rs` (line 360-363 in the codex finding). + let width = 32; + let y_intended = p_n_packed_plane::<12>(width, 163); + let u_full = high_bit_plane::<12>(width, 167); + let v_full = high_bit_plane::<12>(width, 173); + let uv_intended = interleave_uv(&u_full, &v_full); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p_n_444_to_rgb_u16_row::<12, false>( + &y_le, + &uv_le, + &mut out_le, + width, + ColorMatrix::Bt709, + true, + ); + p_n_444_to_rgb_u16_row::<12, true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + #[test] fn neon_p016_be_parity_u8() { if !neon_available() { return; } let width = 32; - let y = p16_plane_neon(width, 107); + let y_intended = p16_plane_neon(width, 107); let u_half = p16_plane_neon(width / 2, 109); let v_half = p16_plane_neon(width / 2, 113); - let uv_half = p010_uv_interleave(&u_half, &v_half); - let y_be = byteswap_u16_buf(&y); - let uv_be = byteswap_u16_buf(&uv_half); + let uv_intended = p010_uv_interleave(&u_half, &v_half); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); let mut out_le = std::vec![0u8; width * 3]; let mut out_be = std::vec![0u8; width * 3]; unsafe { - p16_to_rgb_row::<false>(&y, &uv_half, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_row::<false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); p16_to_rgb_row::<true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); } assert_eq!(out_le, out_be); } +#[test] +fn neon_p016_be_parity_u16() { + if !neon_available() { + return; + } + // Native-depth u16 output exercises `p16_to_rgb_or_rgba_u16_row`, + // a third `vld2q_u16` + 
 `deinterleave_endian::<BE>` site (line 716). + let width = 32; + let y_intended = p16_plane_neon(width, 179); + let u_half = p16_plane_neon(width / 2, 181); + let v_half = p16_plane_neon(width / 2, 191); + let uv_intended = p010_uv_interleave(&u_half, &v_half); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); + + let mut out_le = std::vec![0u16; width * 3]; + let mut out_be = std::vec![0u16; width * 3]; + unsafe { + p16_to_rgb_u16_row::<false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); + p16_to_rgb_u16_row::<true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); + } + assert_eq!(out_le, out_be); +} + #[test] fn neon_p416_be_parity_u16() { if !neon_available() { return; } let width = 32; - let y = p16_plane_neon(width, 127); + let y_intended = p16_plane_neon(width, 127); let u_full = p16_plane_neon(width, 131); let v_full = p16_plane_neon(width, 137); - let uv_full = interleave_uv(&u_full, &v_full); - let y_be = byteswap_u16_buf(&y); - let uv_be = byteswap_u16_buf(&uv_full); + let uv_intended = interleave_uv(&u_full, &v_full); + let y_le = as_le_u16_buf(&y_intended); + let uv_le = as_le_u16_buf(&uv_intended); + let y_be = as_be_u16_buf(&y_intended); + let uv_be = as_be_u16_buf(&uv_intended); let mut out_le = std::vec![0u16; width * 3]; let mut out_be = std::vec![0u16; width * 3]; unsafe { - p_n_444_16_to_rgb_u16_row::<false>(&y, &uv_full, &mut out_le, width, ColorMatrix::Bt709, true); + p_n_444_16_to_rgb_u16_row::<false>(&y_le, &uv_le, &mut out_le, width, ColorMatrix::Bt709, true); p_n_444_16_to_rgb_u16_row::<true>(&y_be, &uv_be, &mut out_be, width, ColorMatrix::Bt709, true); } assert_eq!(out_le, out_be); diff --git a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs index 27376bc3..01873cc6 100644 --- a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs +++ 
b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_2_0.rs @@ -2,13 +2,20 @@ use core::arch::wasm32::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `wasm32` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// -/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping -/// for BE input. When `BE = false` this compiles away entirely. +/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping. +/// Gated on `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88) +/// so a hypothetical BE-wasm host would not double-swap. When the gate +/// folds to `false` at compile time, the call compiles away entirely. #[inline(always)] unsafe fn byteswap_u16x8<const BE: bool>(v: v128) -> v128 { - if BE { + if BE != HOST_NATIVE_BE { let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); u8x16_swizzle(v, mask) } else { diff --git a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs index 8a735757..bd97be8f 100644 --- a/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/wasm_simd128/subsampled_high_bit_pn_4_4_4.rs @@ -2,13 +2,20 @@ use core::arch::wasm32::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `wasm32` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// -/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping -/// for BE input. 
 When `BE = false` this compiles away entirely. +/// Used after `deinterleave_uv_u16_wasm` to apply per-lane byte-swapping. +/// Gated on `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88) +/// so a hypothetical BE-wasm host would not double-swap. When the gate +/// folds to `false` at compile time, the call compiles away entirely. #[inline(always)] unsafe fn byteswap_u16x8<const BE: bool>(v: v128) -> v128 { - if BE { + if BE != HOST_NATIVE_BE { let mask = i8x16(1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14); u8x16_swizzle(v, mask) } else { diff --git a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs index 677af1bc..2503a52c 100644 --- a/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_avx2/subsampled_high_bit_pn_4_2_0.rs @@ -2,13 +2,20 @@ use core::arch::x86_64::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `x86_64` / `i686` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// -/// Used after `deinterleave_uv_u16_avx2` to apply per-lane byte-swapping -/// for BE input. When `BE = false` this compiles away entirely. +/// Used after `deinterleave_uv_u16_avx2` to apply per-lane byte-swapping. +/// Gated on `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88) +/// so a hypothetical BE-x86 host would not double-swap. When the gate +/// folds to `false` at compile time, the call compiles away entirely. 
#[inline(always)] unsafe fn byteswap_u16x16(v: __m256i) -> __m256i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 32], __m256i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, @@ -724,8 +731,11 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row(v: __m256i) -> __m256i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 32], __m256i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, diff --git a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs index dbf38db2..6704638d 100644 --- a/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_avx512/subsampled_high_bit_pn_4_2_0.rs @@ -2,13 +2,20 @@ use core::arch::x86_64::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `x86_64` / `i686` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// -/// Used after `deinterleave_uv_u16_avx512` to apply per-lane byte-swapping -/// for BE input. When `BE = false` this compiles away entirely. +/// Used after `deinterleave_uv_u16_avx512` to apply per-lane byte-swapping. +/// Gated on `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88) +/// so a hypothetical BE-x86 host would not double-swap. When the gate +/// folds to `false` at compile time, the call compiles away entirely. 
#[inline(always)] unsafe fn byteswap_u16x32(v: __m512i) -> __m512i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 64], __m512i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, @@ -706,9 +713,11 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row(v: __m512i) -> __m512i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 64], __m512i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, diff --git a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs index 38a914d2..d962e907 100644 --- a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs +++ b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_2_0.rs @@ -2,15 +2,22 @@ use core::arch::x86_64::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `x86_64` / `i686` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// /// Used after `deinterleave_uv_u16` (or other UV-interleaved loads) to -/// apply the BE byte-swap that `load_endian_u16x8` cannot perform for -/// shuffled-then-loaded values. When `BE = false` this compiles away -/// entirely. +/// apply byte-swapping that `load_endian_u16x8` cannot perform for +/// shuffled-then-loaded values. The gate is `BE != HOST_NATIVE_BE` +/// (mirrors PR #82 / #85 / #87 / #88), so a hypothetical BE-x86 host +/// would not double-swap. When the gate folds to `false` at compile +/// time, the call compiles away entirely. 
#[inline(always)] unsafe fn byteswap_u16x8(v: __m128i) -> __m128i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 16], __m128i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, @@ -684,9 +691,12 @@ pub(crate) unsafe fn p16_to_rgb_or_rgba_u16_row= width >= x + 8 guarantees 8 u16 readable. let uv_raw = _mm_loadu_si128(uv_half.as_ptr().add(x).cast()); // [U0,V0,U1,V1,U2,V2,U3,V3] → [U0,U1,U2,U3, V0,V1,V2,V3]. - // For BE: also swap the two bytes within each u16 lane (lo/hi - // byte indices within each 16-bit element flipped). - let split_mask = if BE { + // When wire endian differs from host (`BE != HOST_NATIVE_BE`): + // also swap the two bytes within each u16 lane (lo/hi byte + // indices within each 16-bit element flipped). On a hypothetical + // BE-x86 host this avoids the double-swap that a plain `BE` + // gate would introduce. + let split_mask = if BE != HOST_NATIVE_BE { _mm_setr_epi8(1, 0, 5, 4, 9, 8, 13, 12, 3, 2, 7, 6, 11, 10, 15, 14) } else { _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15) diff --git a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs index 7bfe9e6c..f6c41f47 100644 --- a/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs +++ b/src/row/arch/x86_sse41/subsampled_high_bit_pn_4_4_4.rs @@ -2,13 +2,20 @@ use core::arch::x86_64::*; use super::*; -/// Byte-swap every u16 lane of `v` in-register (BE ↔ LE conversion). +/// Compile-time host endianness. `true` on BE targets, `false` on LE +/// targets (always `false` on `x86_64` / `i686` in practice). +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Byte-swap every u16 lane of `v` in-register when the source (wire) +/// endian differs from the host's native u16 byte order. /// -/// Used after `deinterleave_uv_u16` to apply per-lane byte-swapping -/// for BE input. When `BE = false` this compiles away entirely. 
+/// Used after `deinterleave_uv_u16` to apply per-lane byte-swapping. +/// Gated on `BE != HOST_NATIVE_BE` (mirrors PR #82 / #85 / #87 / #88) +/// so a hypothetical BE-x86 host would not double-swap. When the gate +/// folds to `false` at compile time, the call compiles away entirely. #[inline(always)] unsafe fn byteswap_u16x8(v: __m128i) -> __m128i { - if BE { + if BE != HOST_NATIVE_BE { let mask = unsafe { core::mem::transmute::<[u8; 16], __m128i>([ 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14, From b9a6c195618a512fbf57f7baa8593d51f9e7b404 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 13:50:42 +1200 Subject: [PATCH 5/8] fix(be-yuv-hb): normalize u16::from_le before validation in high-bit Frame constructors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-2 review on PR #89 flagged two high-severity findings: - src/frame/subsampled_high_bit_pn.rs:239-240 — Pn (P010/P012/P016, P210/P212/P216, P410/P412/P416) `try_new_checked` low-bit validators tested raw `u16` samples, so on a BE host a valid LE-encoded P010 white sample 0xFFC0 reads back as host-native 0xC0FF and the low-6- bits-zero check falsely rejected every row. - src/frame/subsampled_high_bit_planar.rs:268-269 — planar high-bit (Yuv420p/Yuv422p/Yuv444p/Yuv440p at BITS ∈ {10,12,14,16}) `try_new_checked` range validators tested raw `u16` samples, so on a BE host a valid LE-encoded yuv420p10le sample 1023 (bytes [0xFF, 0x03]) reads back as 0xFF03 and was rejected as out of range. The frame types document the **LE-encoded byte layout** contract (the `&[u16]` plane is the FFmpeg `*LE` byte buffer reinterpreted as `u16`). Normalize each sample with `u16::from_le` before the bit/range check so the validator operates on the intended logical sample value on every host. On LE hosts `from_le` is a no-op; on BE hosts it byte- swaps each `u16` back into host-native form. 
Mirrors the established `Y2xxFrame::try_new_checked` pattern already merged on main. Affected validators (every checked-constructor sample-scan loop in this PR's scope is patched): - subsampled_high_bit_planar.rs: Yuv420pFrame16 (4:2:0), Yuv422pFrame16 (4:2:2), Yuv444pFrame16 (4:4:4), Yuv440pFrame16 (4:4:0). - subsampled_high_bit_pn.rs: PnFrame (P010/P012/P016), PnFrame422 (P210/P212/P216), PnFrame444 (P410/P412/P416). The reported `value` in `SampleOutOfRange` / `SampleLowBitsSet` errors is the normalized logical sample so callers can match it against the declared `max_valid` / `low_bits` regardless of host endianness. Tests: add 12 host-independent BE-host regression tests across the three subsampled high-bit test modules, building each plane from LE-encoded bytes via `to_le_bytes` and reading back via `from_ne_bytes` (no `cfg(target_endian = "little")` gate). Each frame family gets one positive case (valid LE buffer that would be rejected without `from_le` on a BE host) and a negative case (out-of-range sample that the validator must still surface after normalization). Covers Yuv420p10/P010/P012, Yuv422p10/Yuv422p12/P210, Yuv444p10/ Yuv444p14/Yuv440p10/Yuv440p12/P410. Audit confirmed there are no other in-scope sample-scan validators on this branch: row-kernel changes touch only the planar high-bit and Pn families. Out-of-scope frame families (Xv36, Bayer16, Yuva*) have not yet had their LE-byte contracts wired through their row kernels on this branch and will be addressed in follow-up BE-support PRs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/subsampled_high_bit_planar.rs | 91 +++++++++++---- src/frame/subsampled_high_bit_pn.rs | 55 +++++++-- src/frame/tests/subsampled_4_2_0_high_bit.rs | 114 +++++++++++++++++++ src/frame/tests/subsampled_4_2_2_high_bit.rs | 84 ++++++++++++++ src/frame/tests/subsampled_4_4_4_high_bit.rs | 114 +++++++++++++++++++ 5 files changed, 422 insertions(+), 36 deletions(-) diff --git a/src/frame/subsampled_high_bit_planar.rs b/src/frame/subsampled_high_bit_planar.rs index 2022e085..1f21c0ae 100644 --- a/src/frame/subsampled_high_bit_planar.rs +++ b/src/frame/subsampled_high_bit_planar.rs @@ -241,6 +241,20 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { /// within that plane's slice, offending value, and the valid /// maximum so the caller can pinpoint the bad sample. All of /// [`Self::try_new`]'s geometry errors are still possible. + /// + /// Per the LE-encoded byte contract documented on the type, samples + /// are validated **after** `u16::from_le` normalization so the range + /// check operates on the intended logical sample value on every host. + /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form before the comparison. Without + /// this normalization a valid `yuv420p10le` plane on a BE host would + /// have its samples appear byte-swapped (e.g. `1023` encoded LE as + /// bytes `[0xFF, 0x03]` reads as host-native `0xFF03` on BE) and the + /// validator would falsely reject every row. The reported `value` in + /// the error is the normalized logical sample so callers can match it + /// against the declared `max_valid`. Mirrors the `Y2xxFrame::try_new_checked` + /// pattern. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn try_new_checked( @@ -266,11 +280,14 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + // Normalize from LE-encoded wire to host-native before the + // range check (no-op on LE host, byte-swap on BE host). + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -279,11 +296,12 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * u_stride as usize; for (col, &s) in u[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -292,11 +310,12 @@ impl<'a, const BITS: u32> Yuv420pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * v_stride as usize; for (col, &s) in v[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -680,6 +699,11 @@ impl<'a, const BITS: u32> Yuv422pFrame16<'a, BITS> { /// `u16` value is valid) — same convention as /// [`Yuv420pFrame16::try_new_checked`]. /// + /// Per the LE-encoded byte contract on the type, samples are validated + /// **after** `u16::from_le` normalization so the range check operates + /// on the intended logical sample on both LE and BE hosts. See + /// [`Yuv420pFrame16::try_new_checked`] for the full rationale. 
+ /// /// Cost: one O(plane_size) linear scan per plane. The default /// [`Self::try_new`] skips this so the hot path (decoder output, /// already-conforming buffers) stays O(1). @@ -708,11 +732,12 @@ impl<'a, const BITS: u32> Yuv422pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -721,11 +746,12 @@ impl<'a, const BITS: u32> Yuv422pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * u_stride as usize; for (col, &s) in u[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -734,11 +760,12 @@ impl<'a, const BITS: u32> Yuv422pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * v_stride as usize; for (col, &s) in v[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -948,6 +975,11 @@ impl<'a, const BITS: u32> Yuv444pFrame16<'a, BITS> { /// `u16` value is valid) — same convention as /// [`Yuv420pFrame16::try_new_checked`]. /// + /// Per the LE-encoded byte contract on the type, samples are validated + /// **after** `u16::from_le` normalization so the range check operates + /// on the intended logical sample on both LE and BE hosts. See + /// [`Yuv420pFrame16::try_new_checked`] for the full rationale. + /// /// Cost: one O(plane_size) linear scan per plane. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -972,11 +1004,12 @@ impl<'a, const BITS: u32> Yuv444pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -985,11 +1018,12 @@ impl<'a, const BITS: u32> Yuv444pFrame16<'a, BITS> { for row in 0..h { let start = row * u_stride as usize; for (col, &s) in u[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -998,11 +1032,12 @@ impl<'a, const BITS: u32> Yuv444pFrame16<'a, BITS> { for row in 0..h { let start = row * v_stride as usize; for (col, &s) in v[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -1228,6 +1263,11 @@ impl<'a, const BITS: u32> Yuv440pFrame16<'a, BITS> { /// produce silently wrong output. Use this constructor on untrusted /// inputs (custom decoders, unchecked FFI buffers, etc.). /// + /// Per the LE-encoded byte contract on the type, samples are validated + /// **after** `u16::from_le` normalization so the range check operates + /// on the intended logical sample on both LE and BE hosts. See + /// [`Yuv420pFrame16::try_new_checked`] for the full rationale. + /// /// Cost: one O(plane_size) linear scan per plane. The chroma planes /// here are full-width × half-height (4:4:0 layout). 
#[cfg_attr(not(tarpaulin), inline(always))] @@ -1253,11 +1293,12 @@ impl<'a, const BITS: u32> Yuv440pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -1266,11 +1307,12 @@ impl<'a, const BITS: u32> Yuv440pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * u_stride as usize; for (col, &s) in u[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -1279,11 +1321,12 @@ impl<'a, const BITS: u32> Yuv440pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * v_stride as usize; for (col, &s) in v[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuv420pFrame16Error::SampleOutOfRange { plane: Yuv420pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } diff --git a/src/frame/subsampled_high_bit_pn.rs b/src/frame/subsampled_high_bit_pn.rs index 77dc9867..c1545366 100644 --- a/src/frame/subsampled_high_bit_pn.rs +++ b/src/frame/subsampled_high_bit_pn.rs @@ -218,6 +218,19 @@ impl<'a, const BITS: u32> PnFrame<'a, BITS> { /// Returns [`PnFrameError::SampleLowBitsSet`] on the first /// offending sample — carries the plane, element index, offending /// value, and the number of low bits expected to be zero. + /// + /// Per the LE-encoded byte contract on the type-level docs, samples + /// are validated **after** `u16::from_le` normalization so the bit + /// check operates on the intended logical sample value on every host. 
+ /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form. Without this normalization a + /// valid `P010LE` plane on a BE host would have its MSB-aligned + /// samples appear byte-swapped (e.g. white = `0xFFC0` LE-encoded + /// reads as host-native `0xC0FF` on BE, with the active bits in the + /// low byte) and the validator would falsely reject every row. The + /// reported `value` in the error is the normalized logical sample. + /// Mirrors the `Y2xxFrame::try_new_checked` pattern. #[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -237,11 +250,14 @@ impl<'a, const BITS: u32> PnFrame<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & low_mask != 0 { + // Normalize from LE-encoded wire to host-native before the + // bit check (no-op on LE host, byte-swap on BE host). + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Y, index: start + col, - value: s, + value: logical, low_bits, }); } @@ -250,11 +266,12 @@ impl<'a, const BITS: u32> PnFrame<'a, BITS> { for row in 0..chroma_h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & low_mask != 0 { + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Uv, index: start + col, - value: s, + value: logical, low_bits, }); } @@ -482,6 +499,11 @@ impl<'a, const BITS: u32> PnFrame422<'a, BITS> { /// rejects any whose low `16 - BITS` bits are non-zero. See /// [`PnFrame::try_new_checked`] for the full discussion of catch /// rates and limitations at each `BITS`. 
+ /// + /// Per the LE-encoded byte contract on the type, samples are + /// validated **after** `u16::from_le` normalization so the bit check + /// operates on the intended logical sample on both LE and BE hosts. + /// See [`PnFrame::try_new_checked`] for the full rationale. #[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -503,11 +525,12 @@ impl<'a, const BITS: u32> PnFrame422<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & low_mask != 0 { + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Y, index: start + col, - value: s, + value: logical, low_bits, }); } @@ -517,11 +540,12 @@ impl<'a, const BITS: u32> PnFrame422<'a, BITS> { for row in 0..h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & low_mask != 0 { + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Uv, index: start + col, - value: s, + value: logical, low_bits, }); } @@ -717,6 +741,11 @@ impl<'a, const BITS: u32> PnFrame444<'a, BITS> { /// rejects any whose low `16 - BITS` bits are non-zero. See /// [`PnFrame::try_new_checked`] for the full discussion of catch /// rates and limitations. + /// + /// Per the LE-encoded byte contract on the type, samples are + /// validated **after** `u16::from_le` normalization so the bit check + /// operates on the intended logical sample on both LE and BE hosts. + /// See [`PnFrame::try_new_checked`] for the full rationale. 
#[cfg_attr(not(tarpaulin), inline(always))] pub fn try_new_checked( y: &'a [u16], @@ -738,11 +767,12 @@ impl<'a, const BITS: u32> PnFrame444<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s & low_mask != 0 { + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Y, index: start + col, - value: s, + value: logical, low_bits, }); } @@ -751,11 +781,12 @@ impl<'a, const BITS: u32> PnFrame444<'a, BITS> { for row in 0..h { let start = row * uv_stride as usize; for (col, &s) in uv[start..start + uv_w].iter().enumerate() { - if s & low_mask != 0 { + let logical = u16::from_le(s); + if logical & low_mask != 0 { return Err(PnFrameError::SampleLowBitsSet { plane: PnFramePlane::Uv, index: start + col, - value: s, + value: logical, low_bits, }); } diff --git a/src/frame/tests/subsampled_4_2_0_high_bit.rs b/src/frame/tests/subsampled_4_2_0_high_bit.rs index 13619dc7..bccf0781 100644 --- a/src/frame/tests/subsampled_4_2_0_high_bit.rs +++ b/src/frame/tests/subsampled_4_2_0_high_bit.rs @@ -535,3 +535,117 @@ fn p012_try_new_checked_accepts_low_packed_flat_content_by_design() { // corruption. The type system, not `try_new_checked`, must // guarantee provenance for 12-bit. } + +// ---- Host-independent BE-host regressions (codex round-2) ----------- +// +// These tests build the planes explicitly from LE-encoded bytes via +// `to_le_bytes` and read back as `&[u16]` via `from_ne_bytes`. On an +// LE host the resulting `u16` values are identical to the intended +// literals; on a BE host every `u16` is byte-swapped relative to the +// intent, exercising the `u16::from_le` normalization inside the +// validators. Without that normalization the validators would falsely +// reject every valid LE-encoded plane on a BE host. 
+// +// Each family covers (1) a positive case — a logical LE buffer of +// valid samples that must be accepted on both LE and BE hosts — and +// (2) a negative case where a sample is invalid even after `from_le` +// normalization, ensuring the validator still surfaces real errors. + +/// Build a `Vec` representing the LE-encoded byte layout of +/// `intended` (i.e., what FFmpeg would emit on the wire). On an LE +/// host the result equals `intended` element-wise; on a BE host every +/// element is byte-swapped relative to `intended`. +fn le_encoded_u16_buf(intended: &[u16]) -> std::vec::Vec { + let bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect() +} + +#[test] +fn yuv420p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // 10-bit-low-packed white = 1023 (LE bytes [0xFF, 0x03]). + let intended_y = std::vec![1023u16; 16 * 8]; + let intended_uv = std::vec![512u16; 8 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8) + .expect("LE-encoded valid yuv420p10le must be accepted on both LE and BE hosts"); +} + +#[test] +fn yuv420p10_try_new_checked_rejects_le_encoded_out_of_range_on_any_host() { + // After `u16::from_le` normalization the offending sample is 1024 + // (just above the 10-bit max of 1023). On both LE and BE hosts the + // validator must catch this — the LE-encoded byte buffer carries the + // logical value 1024 in `u[2 * 8 + 3]`. 
+ let intended_y = std::vec![0u16; 16 * 8]; + let mut intended_u = std::vec![512u16; 8 * 4]; + intended_u[2 * 8 + 3] = 1024; + let intended_v = std::vec![512u16; 8 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_u); + let v = le_encoded_u16_buf(&intended_v); + let e = Yuv420p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + value: 1024, + max_valid: 1023, + .. + } + )); +} + +#[test] +fn p010_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // P010 white = 1023 << 6 = 0xFFC0; LE bytes [0xC0, 0xFF]. On a BE + // host these bytes read back as host-native 0xC0FF (low 6 bits = + // 0x3F) — the validator's `from_le` normalization must recover the + // intended 0xFFC0 before the low-bits check. + let intended_y = std::vec![0xFFC0u16; 16 * 8]; + let intended_uv = std::vec![0x8000u16; 16 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let uv = le_encoded_u16_buf(&intended_uv); + P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("LE-encoded valid P010 must be accepted on both LE and BE hosts"); +} + +#[test] +fn p010_try_new_checked_rejects_le_encoded_low_bits_on_any_host() { + // After `u16::from_le` normalization, a logical 0x03FF has all six + // low bits set — characteristic of `yuv420p10le` data accidentally + // handed to the P010 constructor. The validator must reject this on + // both LE and BE hosts. + let mut intended_y = std::vec![0xFFC0u16; 16 * 8]; + intended_y[3 * 16 + 5] = 0x03FF; + let intended_uv = std::vec![0x8000u16; 16 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let uv = le_encoded_u16_buf(&intended_uv); + let e = P010Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err(); + assert!(matches!( + e, + PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Y, + value: 0x03FF, + low_bits: 6, + .. 
+        }
+    ));
+}
+
+#[test]
+fn p012_try_new_checked_accepts_le_encoded_buffer_on_any_host() {
+    // P012 mid-gray = 2048 << 4 = 0x8000; LE bytes [0x00, 0x80]. On a BE
+    // host these read back as host-native 0x0080 — the validator must
+    // `from_le` to recover 0x8000 before the low-4-bits check.
+    let intended_y = std::vec![(2048u16) << 4; 16 * 8];
+    let intended_uv = std::vec![(2048u16) << 4; 16 * 4];
+    let y = le_encoded_u16_buf(&intended_y);
+    let uv = le_encoded_u16_buf(&intended_uv);
+    P012Frame::try_new_checked(&y, &uv, 16, 8, 16, 16)
+        .expect("LE-encoded valid P012 must be accepted on both LE and BE hosts");
+}
diff --git a/src/frame/tests/subsampled_4_2_2_high_bit.rs b/src/frame/tests/subsampled_4_2_2_high_bit.rs
index d5ff6b08..8095214f 100644
--- a/src/frame/tests/subsampled_4_2_2_high_bit.rs
+++ b/src/frame/tests/subsampled_4_2_2_high_bit.rs
@@ -113,3 +113,87 @@ fn yuv422p16_try_new_checked_accepts_full_u16_range() {
     Yuv422p16Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8)
         .expect("every u16 value is in range at 16 bits");
 }
+
+// ---- Host-independent BE-host regressions (codex round-2) -----------
+//
+// Build planes from LE-encoded bytes via `to_le_bytes` and read back
+// via `from_ne_bytes`. On LE host the buffer matches the literal; on
+// BE host every `u16` is byte-swapped. The validator must `from_le`-
+// normalize before the range check on both hosts. See the comment at
+// the bottom of `subsampled_4_2_0_high_bit.rs` for the full rationale.
+
+fn le_encoded_u16_buf(intended: &[u16]) -> std::vec::Vec<u16> {
+    let bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect()
+}
+
+#[test]
+fn yuv422p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() {
+    // 4:2:2: chroma is half-width × full-height.
+ let intended_y = std::vec![1023u16; 16 * 8]; + let intended_uv = std::vec![512u16; 8 * 8]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + Yuv422p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8) + .expect("LE-encoded valid yuv422p10le must be accepted on both LE and BE hosts"); +} + +#[test] +fn yuv422p12_try_new_checked_rejects_le_encoded_out_of_range_on_any_host() { + // After `from_le` normalization, the offending sample is 4096 + // (just above 12-bit max 4095). + let mut intended_y = std::vec![2048u16; 16 * 8]; + intended_y[5] = 4096; + let intended_uv = std::vec![2048u16; 8 * 8]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + let e = Yuv422p12Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 8, 8).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::Y, + value: 4096, + max_valid: 4095, + .. + } + )); +} + +#[test] +fn p210_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // P210 white = 1023 << 6 = 0xFFC0; LE bytes [0xC0, 0xFF]. + // 4:2:2 PnFrame422: chroma is half-width pairs × full-height ⇒ + // each UV row holds `width` u16 elements (= width/2 pairs × 2). + let intended_y = std::vec![0xFFC0u16; 16 * 8]; + let intended_uv = std::vec![0x8000u16; 16 * 8]; + let y = le_encoded_u16_buf(&intended_y); + let uv = le_encoded_u16_buf(&intended_uv); + P210Frame::try_new_checked(&y, &uv, 16, 8, 16, 16) + .expect("LE-encoded valid P210 must be accepted on both LE and BE hosts"); +} + +#[test] +fn p210_try_new_checked_rejects_le_encoded_low_bits_on_any_host() { + // After `from_le` normalization, the logical 0x03FF has all six + // low bits set — `yuv422p10le`-style data wrongly handed to P210. 
+    let mut intended_y = std::vec![0xFFC0u16; 16 * 8];
+    intended_y[2 * 16 + 7] = 0x03FF;
+    let intended_uv = std::vec![0x8000u16; 16 * 8];
+    let y = le_encoded_u16_buf(&intended_y);
+    let uv = le_encoded_u16_buf(&intended_uv);
+    let e = P210Frame::try_new_checked(&y, &uv, 16, 8, 16, 16).unwrap_err();
+    assert!(matches!(
+        e,
+        PnFrameError::SampleLowBitsSet {
+            plane: PnFramePlane::Y,
+            value: 0x03FF,
+            low_bits: 6,
+            ..
+        }
+    ));
+}
diff --git a/src/frame/tests/subsampled_4_4_4_high_bit.rs b/src/frame/tests/subsampled_4_4_4_high_bit.rs
index a7e3386e..5e4f52c2 100644
--- a/src/frame/tests/subsampled_4_4_4_high_bit.rs
+++ b/src/frame/tests/subsampled_4_4_4_high_bit.rs
@@ -198,3 +198,117 @@ fn yuv440p12_try_new_checked_rejects_above_4095() {
         }
     ));
 }
+
+// ---- Host-independent BE-host regressions (codex round-2) -----------
+//
+// See `subsampled_4_2_0_high_bit.rs` for the full rationale: build
+// planes from LE-encoded bytes so on BE hosts the validator's
+// `u16::from_le` normalization is exercised end-to-end.
+
+fn le_encoded_u16_buf(intended: &[u16]) -> std::vec::Vec<u16> {
+    let bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect()
+}
+
+#[test]
+fn yuv444p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() {
+    // 4:4:4: chroma is full-width × full-height.
+    let intended_y = std::vec![1023u16; 16 * 8];
+    let intended_uv = std::vec![512u16; 16 * 8];
+    let y = le_encoded_u16_buf(&intended_y);
+    let u = le_encoded_u16_buf(&intended_uv);
+    let v = le_encoded_u16_buf(&intended_uv);
+    Yuv444p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 16, 16)
+        .expect("LE-encoded valid yuv444p10le must be accepted on both LE and BE hosts");
+}
+
+#[test]
+fn yuv444p14_try_new_checked_rejects_le_encoded_out_of_range_on_any_host() {
+    // After `from_le` normalization, the offending sample is 16384
+    // (just above 14-bit max 16383).
+ let intended_y = std::vec![8192u16; 16 * 8]; + let intended_u = std::vec![8192u16; 16 * 8]; + let mut intended_v = std::vec![8192u16; 16 * 8]; + intended_v[3 * 16 + 11] = 16384; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_u); + let v = le_encoded_u16_buf(&intended_v); + let e = Yuv444p14Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 16, 16).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::V, + value: 16384, + max_valid: 16383, + .. + } + )); +} + +#[test] +fn yuv440p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // 4:4:0: chroma is full-width × half-height. + let intended_y = std::vec![1023u16; 16 * 8]; + let intended_uv = std::vec![512u16; 16 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + Yuv440p10Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 16, 16) + .expect("LE-encoded valid yuv440p10le must be accepted on both LE and BE hosts"); +} + +#[test] +fn yuv440p12_try_new_checked_rejects_le_encoded_out_of_range_on_any_host() { + let intended_y = std::vec![2048u16; 16 * 8]; + let mut intended_u = std::vec![2048u16; 16 * 4]; + intended_u[2 * 16 + 5] = 4096; + let intended_v = std::vec![2048u16; 16 * 4]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_u); + let v = le_encoded_u16_buf(&intended_v); + let e = Yuv440p12Frame::try_new_checked(&y, &u, &v, 16, 8, 16, 16, 16).unwrap_err(); + assert!(matches!( + e, + Yuv420pFrame16Error::SampleOutOfRange { + plane: Yuv420pFrame16Plane::U, + value: 4096, + max_valid: 4095, + .. + } + )); +} + +#[test] +fn p410_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // 4:4:4 PnFrame444: chroma is full-width × full-height with 2 u16 + // per pair ⇒ each UV row holds 2 * width u16 elements. 
+ let intended_y = std::vec![0xFFC0u16; 16 * 8]; + let intended_uv = std::vec![0x8000u16; 32 * 8]; + let y = le_encoded_u16_buf(&intended_y); + let uv = le_encoded_u16_buf(&intended_uv); + P410Frame::try_new_checked(&y, &uv, 16, 8, 16, 32) + .expect("LE-encoded valid P410 must be accepted on both LE and BE hosts"); +} + +#[test] +fn p410_try_new_checked_rejects_le_encoded_low_bits_on_any_host() { + // Logical 0x03FF (low 6 bits all set) on the UV plane. + let intended_y = std::vec![0xFFC0u16; 16 * 8]; + let mut intended_uv = std::vec![0x8000u16; 32 * 8]; + intended_uv[4 * 32 + 17] = 0x03FF; + let y = le_encoded_u16_buf(&intended_y); + let uv = le_encoded_u16_buf(&intended_uv); + let e = P410Frame::try_new_checked(&y, &uv, 16, 8, 16, 32).unwrap_err(); + assert!(matches!( + e, + PnFrameError::SampleLowBitsSet { + plane: PnFramePlane::Uv, + value: 0x03FF, + low_bits: 6, + .. + } + )); +} From 1c2df3d9818b3dcbfa9141171b2d116245c4b442 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 14:20:41 +1200 Subject: [PATCH 6/8] feat(be-yuv-hb): wire endian-aware row dispatch for high-bit YUV planar + Pn families MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-3 found that the BE row kernels added in this PR are not reachable from the public row dispatch API for the high-bit YUV planar (yuv420p9/10/12/14/16, yuv444p9/10/12/14/16) and Pn (P010/012/016, P410/412/416) families: the existing dispatcher signatures hard-wire `BE = false` and have no endian parameter, mirroring neither the Y210/V210 pattern nor the Tier-8 packed-RGB-16 pattern. BE input fed through these dispatchers was silently decoded as LE. 
This commit adds endian-aware dispatch entry points across all 17 high-bit row dispatcher files, matching the established pattern in `src/row/dispatch/{y210,y212,y216,v210,v410,xv36,ayuv64}.rs`: * For each `_to__row(...)` public dispatcher, add a new `_to__row_endian(..., big_endian: bool)` variant that routes `BE=true` through the scalar fallback and every SIMD backend (NEON / SSE4.1 / AVX2 / AVX-512BW / wasm-simd128) via the const-generic `<..., true>` path. The original LE-only function is kept as a backwards-compat one-line wrapper that calls `_endian(..., false)`. * Crate-private BITS-generic helpers `dispatch::yuv444::yuv_444p_n_to_rgb_row` / `..._to_rgb_u16_row` and `dispatch::pn::p_n_444_to_rgb_row` / `..._to_rgb_u16_row` now also carry a `` generic; the per-bit thin wrappers (yuv444p9/10/12/14 RGB and the P410/P412 RGB / RGB-u16 paths) gain matching `_endian` thin wrappers that pick `` or ``. New `_endian` entry points (64 total): * yuv420p{9,10,12,14,16}: 5 files × {rgb, rgb_u16, rgba, rgba_u16} = 20 * yuv444p{9,10,12,14,16}: 5 files × 4 = 20 * p010 / p012 / p016: 3 files × 4 = 12 * pn.rs (P410 / P412 / P416): 3 formats × {rgb, rgb_u16, rgba, rgba_u16} = 12 Sinker call sites in `src/sinker/mixed/**` are deliberately unchanged — per the established Frame-contract convention from PR #92, direct Frame-driven row dispatch uses the LE-encoded byte contract. Sinkers keep calling the LE wrapper (or equivalently `_endian(..., false)`), so back-compat is preserved. Adds dispatch-level BE/LE parity tests in `src/row/dispatch/be_yuv_hb_parity_tests.rs` plus an inline test in `yuv420/yuv420p10.rs`. Each test: * Builds a host-native u16 fixture, serializes via `to_le_bytes` / `to_be_bytes`, then reinterprets the byte streams via `from_ne_bytes`. The resulting LE / BE buffers exercise the dispatcher's `BE=false` and `BE=true` paths regardless of host endianness (no `cfg(target_endian)` gate — mirrors PR #82 `8f2e329` and `cbedaf1`). 
* Asserts byte-identical output between `_endian(LE_buf, false)` and `_endian(BE_buf, true)` for both `use_simd = false` and `use_simd = true`, so SIMD backends are covered when the host CPU supports them. * Coverage: yuv420p10, yuv444p10, p010, p410 (one per family minimum per the spec) plus yuv420p16, p016, yuv444p16, p416 (16-bit i64-chroma family). 8 new tests; all pass on aarch64-apple-darwin alongside the pre-existing 2346 lib tests. Verified: * `cargo test --target aarch64-apple-darwin --lib` — 2354 passed * `cargo build --target x86_64-apple-darwin --tests` — 0 warnings * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` — clean * `cargo build --no-default-features` — clean * `cargo fmt --check` — clean * `cargo clippy --all-targets --all-features -- -D warnings` — clean * `cargo check --target s390x-unknown-linux-gnu --lib` — clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/be_yuv_hb_parity_tests.rs | 724 +++++++++++++++++++++ src/row/dispatch/mod.rs | 9 + src/row/dispatch/pn.rs | 670 ++++++++++++++----- src/row/dispatch/yuv420/p010.rs | 272 ++++++-- src/row/dispatch/yuv420/p012.rs | 272 ++++++-- src/row/dispatch/yuv420/p016.rs | 268 ++++++-- src/row/dispatch/yuv420/yuv420p10.rs | 526 ++++++++++++--- src/row/dispatch/yuv420/yuv420p12.rs | 356 +++++++--- src/row/dispatch/yuv420/yuv420p14.rs | 356 +++++++--- src/row/dispatch/yuv420/yuv420p16.rs | 280 ++++++-- src/row/dispatch/yuv420/yuv420p9.rs | 354 +++++++--- src/row/dispatch/yuv444/mod.rs | 28 +- src/row/dispatch/yuv444/yuv444p10.rs | 198 ++++-- src/row/dispatch/yuv444/yuv444p12.rs | 184 +++++- src/row/dispatch/yuv444/yuv444p14.rs | 184 +++++- src/row/dispatch/yuv444/yuv444p16.rs | 268 ++++++-- src/row/dispatch/yuv444/yuv444p9.rs | 184 +++++- src/row/mod.rs | 2 +- 18 files changed, 4118 insertions(+), 1017 deletions(-) create mode 100644 src/row/dispatch/be_yuv_hb_parity_tests.rs diff --git a/src/row/dispatch/be_yuv_hb_parity_tests.rs 
b/src/row/dispatch/be_yuv_hb_parity_tests.rs new file mode 100644 index 00000000..02257ad7 --- /dev/null +++ b/src/row/dispatch/be_yuv_hb_parity_tests.rs @@ -0,0 +1,724 @@ +//! Dispatch-level BE/LE parity tests for the high-bit YUV planar and +//! P-format row dispatchers (codex round-3 follow-up on +//! `feat/be-yuv-hb`). +//! +//! Each test asserts that `_to__row_endian(.., true)` +//! on a BE-encoded fixture produces byte-identical output to +//! `_to__row_endian(.., false)` on the corresponding +//! LE-encoded fixture. Fixtures are built byte-wise via +//! `to_le_bytes` / `to_be_bytes` and reinterpreted with `from_ne_bytes`, +//! so the test is host-independent — the LE buffer / `BE=false` pair +//! exercises the no-swap kernel path while the BE buffer / `BE=true` +//! pair exercises the swap path, regardless of whether the host is LE +//! or BE. +//! +//! Mirrors PR #82 `8f2e329` and the per-arch BE/LE parity tests. +//! Tests run with SIMD active where the host CPU supports it; the +//! `#[cfg_attr(miri, ignore)]` guard avoids exercising SIMD intrinsics +//! under Miri. +//! +//! Coverage (one representative per family): +//! - `yuv420p10` — already covered inline in `dispatch/yuv420/yuv420p10.rs` +//! - `yuv444p10` — full-width planar, BITS-generic helper path +//! - `p010` — 4:2:0 P-format, low-packed scalar / SIMD kernels +//! - `p410` — 4:4:4 P-format, BITS-generic helper path +//! - `yuv420p16` / `p016` / `yuv444p16` / `p416` — dedicated 16-bit +//! kernels (i64 chroma multiply path). + +use crate::{ColorMatrix, row::*}; + +/// Build LE / BE host-native u16 buffers from a slice of intended u16 +/// samples. Returns `(le, be)` where each slice contains `u16` elements +/// such that `to_ne_bytes` reproduces the LE/BE wire bytes for the +/// intended values. Identical pattern to the per-arch fixtures (see +/// `src/row/arch/*/tests/ayuv64.rs`). 
+fn split_le_be(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+    let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+    let le: std::vec::Vec<u16> = le_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    let be: std::vec::Vec<u16> = be_bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect();
+    (le, be)
+}
+
+fn pseudo_plane(len: usize, seed: u32, mask: u16) -> std::vec::Vec<u16> {
+    (0..len)
+        .map(|i| ((seed.wrapping_mul(i as u32 + 1).wrapping_add(0x55_u32)) & mask as u32) as u16)
+        .collect()
+}
+
+fn pseudo_uv_interleaved(half_pairs: usize, seed: u32, mask: u16) -> std::vec::Vec<u16> {
+    (0..half_pairs * 2)
+        .map(|i| ((seed.wrapping_mul(i as u32 + 7).wrapping_add(0x123_u32)) & mask as u32) as u16)
+        .collect()
+}
+
+// ---- yuv444p10 dispatch parity ------------------------------------------
+
+#[test]
+#[cfg_attr(
+    miri,
+    ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+)]
+fn yuv444p10_dispatch_be_le_parity_simd_and_scalar() {
+    for w in [8usize, 16, 24] {
+        let y_int = pseudo_plane(w, 0x111, 0x3FF);
+        let u_int = pseudo_plane(w, 0x222, 0x3FF);
+        let v_int = pseudo_plane(w, 0x333, 0x3FF);
+        let (y_le, y_be) = split_le_be(&y_int);
+        let (u_le, u_be) = split_le_be(&u_int);
+        let (v_le, v_be) = split_le_be(&v_int);
+
+        for &use_simd in &[false, true] {
+            // u8 RGB — exercises BITS-generic `yuv_444p_n_to_rgb_row<10, BE>`.
+ let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + yuv444p10_to_rgb_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p10_to_rgb_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!( + out_le, out_be, + "yuv444p10 rgb BE/LE parity (w={w}, simd={use_simd})" + ); + + // u16 RGB + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + yuv444p10_to_rgb_u16_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p10_to_rgb_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuv444p10 rgb_u16 BE/LE parity"); + + // u8 RGBA + let mut out_le4 = std::vec![0u8; w * 4]; + let mut out_be4 = std::vec![0u8; w * 4]; + yuv444p10_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le4, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p10_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be4, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4, out_be4, "yuv444p10 rgba BE/LE parity"); + + // u16 RGBA + let mut out_le4u = std::vec![0u16; w * 4]; + let mut out_be4u = std::vec![0u16; w * 4]; + yuv444p10_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p10_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4u, out_be4u, "yuv444p10 rgba_u16 BE/LE parity"); + } + } +} + +// ---- p010 dispatch parity (semi-planar 4:2:0, 10-bit high-packed) ------ + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] 
+fn p010_dispatch_be_le_parity_simd_and_scalar() { + for w in [8usize, 16, 24] { + // P010 stores active bits in the high 10 of each u16 (sample << 6), + // so build samples already shifted into MSB-aligned form. + let y_int: std::vec::Vec = pseudo_plane(w, 0x440, 0x3FF) + .into_iter() + .map(|v| v << 6) + .collect(); + let uv_int: std::vec::Vec = pseudo_uv_interleaved(w / 2, 0x55C, 0x3FF) + .into_iter() + .map(|v| v << 6) + .collect(); + let (y_le, y_be) = split_le_be(&y_int); + let (uv_le, uv_be) = split_le_be(&uv_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + p010_to_rgb_row_endian( + &y_le, + &uv_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p010_to_rgb_row_endian( + &y_be, + &uv_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "p010 rgb BE/LE parity (w={w})"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + p010_to_rgb_u16_row_endian( + &y_le, + &uv_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p010_to_rgb_u16_row_endian( + &y_be, + &uv_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "p010 rgb_u16 BE/LE parity"); + + let mut out_le4 = std::vec![0u8; w * 4]; + let mut out_be4 = std::vec![0u8; w * 4]; + p010_to_rgba_row_endian( + &y_le, + &uv_le, + &mut out_le4, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p010_to_rgba_row_endian( + &y_be, + &uv_be, + &mut out_be4, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4, out_be4, "p010 rgba BE/LE parity"); + + let mut out_le4u = std::vec![0u16; w * 4]; + let mut out_be4u = std::vec![0u16; w * 4]; + p010_to_rgba_u16_row_endian( + &y_le, + &uv_le, + &mut out_le4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + 
p010_to_rgba_u16_row_endian( + &y_be, + &uv_be, + &mut out_be4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4u, out_be4u, "p010 rgba_u16 BE/LE parity"); + } + } +} + +// ---- p410 dispatch parity (semi-planar 4:4:4, 10-bit high-packed) ------ + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p410_dispatch_be_le_parity_simd_and_scalar() { + for w in [8usize, 16, 24] { + let y_int: std::vec::Vec = pseudo_plane(w, 0x710, 0x3FF) + .into_iter() + .map(|v| v << 6) + .collect(); + // P4xx UV is full-width interleaved (one (U,V) pair per pixel). + let uv_int: std::vec::Vec = pseudo_uv_interleaved(w, 0x842, 0x3FF) + .into_iter() + .map(|v| v << 6) + .collect(); + let (y_le, y_be) = split_le_be(&y_int); + let (uv_le, uv_be) = split_le_be(&uv_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + p410_to_rgb_row_endian( + &y_le, + &uv_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p410_to_rgb_row_endian( + &y_be, + &uv_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "p410 rgb BE/LE parity (w={w})"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + p410_to_rgb_u16_row_endian( + &y_le, + &uv_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p410_to_rgb_u16_row_endian( + &y_be, + &uv_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "p410 rgb_u16 BE/LE parity"); + + let mut out_le4 = std::vec![0u8; w * 4]; + let mut out_be4 = std::vec![0u8; w * 4]; + p410_to_rgba_row_endian( + &y_le, + &uv_le, + &mut out_le4, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p410_to_rgba_row_endian( + &y_be, + &uv_be, + &mut out_be4, + w, + ColorMatrix::Bt709, + false, 
+ use_simd, + true, + ); + assert_eq!(out_le4, out_be4, "p410 rgba BE/LE parity"); + + let mut out_le4u = std::vec![0u16; w * 4]; + let mut out_be4u = std::vec![0u16; w * 4]; + p410_to_rgba_u16_row_endian( + &y_le, + &uv_le, + &mut out_le4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p410_to_rgba_u16_row_endian( + &y_be, + &uv_be, + &mut out_be4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4u, out_be4u, "p410 rgba_u16 BE/LE parity"); + } + } +} + +// ---- 16-bit families: yuv420p16 / p016 / yuv444p16 / p416 -------------- + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv420p16_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0xAAAA, 0xFFFF); + let u_int = pseudo_plane(w / 2, 0xBBBB, 0xFFFF); + let v_int = pseudo_plane(w / 2, 0xCCCC, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + yuv420p16_to_rgb_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv420p16_to_rgb_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "yuv420p16 rgb BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + yuv420p16_to_rgb_u16_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv420p16_to_rgb_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuv420p16 rgb_u16 BE/LE parity"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row 
kernels use intrinsics unsupported by Miri" +)] +fn p016_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0xD0D0, 0xFFFF); + let uv_int = pseudo_uv_interleaved(w / 2, 0xE0E0, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (uv_le, uv_be) = split_le_be(&uv_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + p016_to_rgb_row_endian( + &y_le, + &uv_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p016_to_rgb_row_endian( + &y_be, + &uv_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "p016 rgb BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + p016_to_rgb_u16_row_endian( + &y_le, + &uv_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p016_to_rgb_u16_row_endian( + &y_be, + &uv_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "p016 rgb_u16 BE/LE parity"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuv444p16_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0x4444, 0xFFFF); + let u_int = pseudo_plane(w, 0x5555, 0xFFFF); + let v_int = pseudo_plane(w, 0x6666, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + yuv444p16_to_rgb_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p16_to_rgb_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, 
"yuv444p16 rgb BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + yuv444p16_to_rgb_u16_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv444p16_to_rgb_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuv444p16 rgb_u16 BE/LE parity"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn p416_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0x7070, 0xFFFF); + let uv_int = pseudo_uv_interleaved(w, 0x8080, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (uv_le, uv_be) = split_le_be(&uv_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + p416_to_rgb_row_endian( + &y_le, + &uv_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p416_to_rgb_row_endian( + &y_be, + &uv_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "p416 rgb BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 3]; + let mut out_be16 = std::vec![0u16; w * 3]; + p416_to_rgb_u16_row_endian( + &y_le, + &uv_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p416_to_rgb_u16_row_endian( + &y_be, + &uv_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "p416 rgb_u16 BE/LE parity"); + + // Also exercise the i64 chroma RGBA path. 
+ let mut out_le4u = std::vec![0u16; w * 4]; + let mut out_be4u = std::vec![0u16; w * 4]; + p416_to_rgba_u16_row_endian( + &y_le, + &uv_le, + &mut out_le4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + p416_to_rgba_u16_row_endian( + &y_be, + &uv_be, + &mut out_be4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4u, out_be4u, "p416 rgba_u16 BE/LE parity"); + } +} diff --git a/src/row/dispatch/mod.rs b/src/row/dispatch/mod.rs index c1ae8ccb..9e60b3be 100644 --- a/src/row/dispatch/mod.rs +++ b/src/row/dispatch/mod.rs @@ -42,3 +42,12 @@ pub(super) mod ya8; pub(super) mod yuv420; pub(super) mod yuv444; pub(super) mod yuva; + +// Dispatch-level BE/LE parity tests for the high-bit YUV planar and +// P-format families (codex round-3 follow-up on `feat/be-yuv-hb`). The +// per-format dispatchers in `yuv420::*`, `yuv444::*`, and `pn::*` each +// gained `_endian` entry points; this module asserts that the BE path +// is reachable and produces byte-identical output to the LE path on +// matching fixtures. +#[cfg(all(test, feature = "std"))] +mod be_yuv_hb_parity_tests; diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index 1f8b7526..f0f75646 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -47,7 +47,7 @@ use crate::{ /// `BITS` to a literal. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_row( +pub(crate) fn p_n_444_to_rgb_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -68,7 +68,7 @@ pub(crate) fn p_n_444_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -77,21 +77,21 @@ pub(crate) fn p_n_444_to_rgb_row( if avx512_available() { // SAFETY: AVX-512BW verified. 
unsafe { - arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -100,7 +100,7 @@ pub(crate) fn p_n_444_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile-time verified. unsafe { - arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -109,7 +109,7 @@ pub(crate) fn p_n_444_to_rgb_row( } } - scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** @@ -118,7 +118,7 @@ pub(crate) fn p_n_444_to_rgb_row( /// [`p_n_444_to_rgb_row`]. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_u16_row( +pub(crate) fn p_n_444_to_rgb_u16_row( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -139,7 +139,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. 
unsafe { - arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -147,19 +147,19 @@ pub(crate) fn p_n_444_to_rgb_u16_row( target_arch = "x86_64" => { if avx512_available() { unsafe { - arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if avx2_available() { unsafe { - arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } if sse41_available() { unsafe { - arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -167,7 +167,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( target_arch = "wasm32" => { if simd128_available() { unsafe { - arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } return; } @@ -176,7 +176,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( } } - scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } /// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. @@ -186,7 +186,7 @@ pub(crate) fn p_n_444_to_rgb_u16_row( /// pinned to BITS ∈ {10, 12}. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_row( +pub fn p416_to_rgb_row_endian( y: &[u16], uv_full: &[u16], rgb_out: &mut [u8], @@ -194,6 +194,7 @@ pub fn p416_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgb_min = rgb_row_bytes(width); let uv_min = uv_full_row_elems(width); @@ -201,42 +202,53 @@ pub fn p416_to_rgb_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, 
full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -244,7 +256,28 @@ pub fn p416_to_rgb_row( } } - scalar::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range), + scalar::p_n_444_16_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p416_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p416_to_rgb_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). Chroma @@ -252,7 +285,7 @@ pub fn p416_to_rgb_row( /// see scalar reference for the rationale. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_u16_row( +pub fn p416_to_rgb_u16_row_endian( y: &[u16], uv_full: &[u16], rgb_out: &mut [u16], @@ -260,6 +293,7 @@ pub fn p416_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgb_min = rgb_row_elems(width); let uv_min = uv_full_row_elems(width); @@ -267,41 +301,52 @@ pub fn p416_to_rgb_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_u16_row::(y, uv_full, 
rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -309,10 +354,51 @@ pub fn p416_to_rgb_u16_row( } } - scalar::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range), + scalar::p_n_444_16_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p416_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p416_to_rgb_u16_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); +} + +/// P410 → packed u8 RGB. Endian-aware thin wrapper at `BITS = 10`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_row_endian( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + p_n_444_to_rgb_row::<10, true>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } else { + p_n_444_to_rgb_row::<10, false>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } } -/// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. +/// LE-only wrapper around [`p410_to_rgb_row_endian`]. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p410_to_rgb_row( @@ -324,10 +410,33 @@ pub fn p410_to_rgb_row( full_range: bool, use_simd: bool, ) { - p_n_444_to_rgb_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + p410_to_rgb_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); } -/// P410 → native-depth u16 RGB (10-bit low-packed output). +/// P410 → native-depth u16 RGB (10-bit low-packed output). Endian-aware +/// thin wrapper. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_u16_row_endian( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + p_n_444_to_rgb_u16_row::<10, true>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } else { + p_n_444_to_rgb_u16_row::<10, false>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`p410_to_rgb_u16_row_endian`]. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p410_to_rgb_u16_row( @@ -339,10 +448,32 @@ pub fn p410_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - p_n_444_to_rgb_u16_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + p410_to_rgb_u16_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); } -/// P412 → packed u8 RGB. Thin wrapper at `BITS = 12`. +/// P412 → packed u8 RGB. Endian-aware thin wrapper at `BITS = 12`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_row_endian( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + p_n_444_to_rgb_row::<12, true>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } else { + p_n_444_to_rgb_row::<12, false>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`p412_to_rgb_row_endian`]. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p412_to_rgb_row( @@ -354,10 +485,33 @@ pub fn p412_to_rgb_row( full_range: bool, use_simd: bool, ) { - p_n_444_to_rgb_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + p412_to_rgb_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); } -/// P412 → native-depth u16 RGB (12-bit low-packed output). +/// P412 → native-depth u16 RGB (12-bit low-packed output). Endian-aware +/// thin wrapper. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_u16_row_endian( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + p_n_444_to_rgb_u16_row::<12, true>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } else { + p_n_444_to_rgb_u16_row::<12, false>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`p412_to_rgb_u16_row_endian`]. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn p412_to_rgb_u16_row( @@ -369,7 +523,9 @@ pub fn p412_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); + p412_to_rgb_u16_row_endian( + y, uv_full, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** @@ -378,7 +534,7 @@ pub fn p412_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_row( +pub fn p410_to_rgba_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -386,6 +542,7 @@ pub fn p410_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); let uv_min = uv_full_row_elems(width); @@ -393,46 +550,57 @@ pub fn p410_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -440,7 +608,28 @@ pub fn p410_to_rgba_row( } } - scalar::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_to_rgba_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_to_rgba_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p410_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p410_to_rgba_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// P410 → **native-depth `u16`** packed **RGBA** — output is @@ -449,7 +638,7 @@ pub fn p410_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_u16_row( +pub fn p410_to_rgba_u16_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -457,6 +646,7 @@ pub fn p410_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); let uv_min = uv_full_row_elems(width); @@ -464,46 +654,57 @@ pub fn p410_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -511,7 +712,28 @@ pub fn p410_to_rgba_u16_row( } } - scalar::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_to_rgba_u16_row::<10, false>(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_to_rgba_u16_row::<10, true>(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p410_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p410_to_rgba_u16_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** @@ -520,7 +742,7 @@ pub fn p410_to_rgba_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_row( +pub fn p412_to_rgba_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -528,6 +750,7 @@ pub fn p412_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); let uv_min = uv_full_row_elems(width); @@ -535,46 +758,57 @@ pub fn p412_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -582,7 +816,28 @@ pub fn p412_to_rgba_row( } } - scalar::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_to_rgba_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_to_rgba_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p412_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p412_to_rgba_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// P412 → **native-depth `u16`** packed **RGBA** — output is @@ -591,7 +846,7 @@ pub fn p412_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_u16_row( +pub fn p412_to_rgba_u16_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -599,6 +854,7 @@ pub fn p412_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); let uv_min = uv_full_row_elems(width); @@ -606,46 +862,57 @@ pub fn p412_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -653,7 +920,28 @@ pub fn p412_to_rgba_u16_row( } } - scalar::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_to_rgba_u16_row::<12, false>(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_to_rgba_u16_row::<12, true>(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p412_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p412_to_rgba_u16_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** @@ -663,7 +951,7 @@ pub fn p412_to_rgba_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_row( +pub fn p416_to_rgba_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u8], @@ -671,6 +959,7 @@ pub fn p416_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); let uv_min = uv_full_row_elems(width); @@ -678,46 +967,57 @@ pub fn p416_to_rgba_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -725,7 +1025,28 @@ pub fn p416_to_rgba_row( } } - scalar::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_16_to_rgba_row::(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p416_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p416_to_rgba_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// P416 → **native-depth `u16`** packed **RGBA** — full-range output @@ -736,7 +1057,7 @@ pub fn p416_to_rgba_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_u16_row( +pub fn p416_to_rgba_u16_row_endian( y: &[u16], uv_full: &[u16], rgba_out: &mut [u16], @@ -744,6 +1065,7 @@ pub fn p416_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); let uv_min = uv_full_row_elems(width); @@ -751,46 +1073,57 @@ pub fn p416_to_rgba_u16_row( assert!(uv_full.len() >= uv_min, "uv_full row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -798,5 +1131,26 @@ pub fn p416_to_rgba_u16_row( } } - scalar::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range), + scalar::p_n_444_16_to_rgba_u16_row::(y, uv_full, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p416_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p416_to_rgba_u16_row_endian( + y, uv_full, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index e9912e75..c61ede84 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -28,7 +28,7 @@ use crate::{ /// specification. `use_simd = false` forces the scalar reference. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_row( +pub fn p010_to_rgb_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -36,6 +36,7 @@ pub fn p010_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P010 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -43,46 +44,57 @@ pub fn p010_to_rgb_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -90,7 +102,28 @@ pub fn p010_to_rgb_row( } } - scalar::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgb_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range), + scalar::p_n_to_rgb_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p010_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p010_to_rgb_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P010** to **native‑depth `u16`** packed RGB @@ -103,7 +136,7 @@ pub fn p010_to_rgb_row( /// `use_simd = false` forces the scalar reference. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_u16_row( +pub fn p010_to_rgb_u16_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -111,6 +144,7 @@ pub fn p010_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P010 requires even width"); let rgb_min = rgb_row_elems(width); @@ -118,48 +152,61 @@ pub fn p010_to_rgb_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgb_u16_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgb_u16_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgb_u16_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgb_u16_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgb_u16_row::<10, false>( y, uv_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::p_n_to_rgb_u16_row::<10, true>( + y, uv_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, @@ -167,7 +214,28 @@ pub fn p010_to_rgb_u16_row( } } - scalar::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgb_u16_row::<10, false>(y, uv_half, rgb_out, width, matrix, full_range), + scalar::p_n_to_rgb_u16_row::<10, true>(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p010_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p010_to_rgb_u16_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, @@ -178,7 +246,7 @@ pub fn p010_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_row( +pub fn p010_to_rgba_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -186,6 +254,7 @@ pub fn p010_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -193,46 +262,57 @@ pub fn p010_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -240,7 +320,28 @@ pub fn p010_to_rgba_row( } } - scalar::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgba_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p_n_to_rgba_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p010_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p010_to_rgba_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, @@ -251,7 +352,7 @@ pub fn p010_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_u16_row( +pub fn p010_to_rgba_u16_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -259,6 +360,7 @@ pub fn p010_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -266,46 +368,57 @@ pub fn p010_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -313,5 +426,26 @@ pub fn p010_to_rgba_u16_row( } } - scalar::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgba_u16_row::<10, false>(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p_n_to_rgba_u16_row::<10, true>(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p010_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p010_to_rgba_u16_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index a91ac9b0..9c1d76a5 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -27,7 +27,7 @@ use crate::{ /// `>> 4` instead of `>> 6` at each `u16` load. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_row( +pub fn p012_to_rgb_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -35,6 +35,7 @@ pub fn p012_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P012 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -42,41 +43,52 @@ pub fn p012_to_rgb_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -84,7 
+96,28 @@ pub fn p012_to_rgb_row( } } - scalar::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgb_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range), + scalar::p_n_to_rgb_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p012_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p012_to_rgb_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P012** to **native‑depth `u16`** packed RGB @@ -92,7 +125,7 @@ pub fn p012_to_rgb_row( /// `yuv420p12le` convention, **not** P012's high‑bit packing). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_u16_row( +pub fn p012_to_rgb_u16_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -100,6 +133,7 @@ pub fn p012_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P012 requires even width"); let rgb_min = rgb_row_elems(width); @@ -107,43 +141,56 @@ pub fn p012_to_rgb_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgb_u16_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgb_u16_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgb_u16_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgb_u16_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgb_u16_row::<12, false>( y, uv_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::p_n_to_rgb_u16_row::<12, true>( + y, uv_half, rgb_out, width, matrix, full_range, + ); } + 
); return; } }, @@ -151,7 +198,28 @@ pub fn p012_to_rgb_u16_row( } } - scalar::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgb_u16_row::<12, false>(y, uv_half, rgb_out, width, matrix, full_range), + scalar::p_n_to_rgb_u16_row::<12, true>(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p012_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p012_to_rgb_u16_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, @@ -162,7 +230,7 @@ pub fn p012_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_row( +pub fn p012_to_rgba_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -170,6 +238,7 @@ pub fn p012_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -177,46 +246,57 @@ pub fn p012_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -224,7 +304,28 @@ pub fn p012_to_rgba_row( } } - scalar::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgba_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p_n_to_rgba_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p012_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p012_to_rgba_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, @@ -235,7 +336,7 @@ pub fn p012_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_u16_row( +pub fn p012_to_rgba_u16_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -243,6 +344,7 @@ pub fn p012_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -250,46 +352,57 @@ pub fn p012_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -297,5 +410,26 @@ pub fn p012_to_rgba_u16_row( } } - scalar::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p_n_to_rgba_u16_row::<12, false>(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p_n_to_rgba_u16_row::<12, true>(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p012_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p012_to_rgba_u16_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index d21bc556..97cd2e53 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -22,7 +22,7 @@ use crate::{ /// vs. low-bit-packed distinction (all bits are active). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_row( +pub fn p016_to_rgb_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u8], @@ -30,6 +30,7 @@ pub fn p016_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P016 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -37,41 +38,52 @@ pub fn p016_to_rgb_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -79,14 +91,35 @@ pub fn p016_to_rgb_row( } } - scalar::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::p16_to_rgb_row::(y, 
uv_half, rgb_out, width, matrix, full_range), + scalar::p16_to_rgb_row::(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p016_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p016_to_rgb_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P016** to **native-depth `u16`** packed RGB /// (full-range output in `[0, 65535]`). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_u16_row( +pub fn p016_to_rgb_u16_row_endian( y: &[u16], uv_half: &[u16], rgb_out: &mut [u16], @@ -94,6 +127,7 @@ pub fn p016_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "P016 requires even width"); let rgb_min = rgb_row_elems(width); @@ -101,41 +135,52 @@ pub fn p016_to_rgb_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -143,7 +188,28 @@ pub fn p016_to_rgb_u16_row( } } - scalar::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, 
matrix, full_range); + dispatch_be!( + scalar::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range), + scalar::p16_to_rgb_u16_row::(y, uv_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p016_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p016_to_rgb_u16_row_endian( + y, uv_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit @@ -154,7 +220,7 @@ pub fn p016_to_rgb_u16_row( /// reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_row( +pub fn p016_to_rgba_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u8], @@ -162,6 +228,7 @@ pub fn p016_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -169,41 +236,52 @@ pub fn p016_to_rgba_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -211,7 +289,28 @@ pub fn p016_to_rgba_row( } } - scalar::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( 
+ scalar::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p16_to_rgba_row::(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p016_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p016_to_rgba_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **P016** to **native-depth `u16`** packed @@ -223,7 +322,7 @@ pub fn p016_to_rgba_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_u16_row( +pub fn p016_to_rgba_u16_row_endian( y: &[u16], uv_half: &[u16], rgba_out: &mut [u16], @@ -231,6 +330,7 @@ pub fn p016_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -238,41 +338,52 @@ pub fn p016_to_rgba_u16_row( assert!(uv_half.len() >= width, "uv_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -280,5 +391,26 @@ pub fn p016_to_rgba_u16_row( } } - 
scalar::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range), + scalar::p16_to_rgba_u16_row::(y, uv_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`p016_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p016_to_rgba_u16_row_endian( + y, uv_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 8088e8c9..2204feb7 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -29,7 +29,7 @@ use crate::{ /// path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_row( +pub fn yuv420p10_to_rgb_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -38,6 +38,7 @@ pub fn yuv420p10_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -46,55 +47,74 @@ pub fn yuv420p10_to_rgb_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). 
- unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<10, true>(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<10, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<10, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -102,7 +122,33 @@ pub fn yuv420p10_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<10, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p_n_to_rgb_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgb_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuv420p10_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p10_to_rgb_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed @@ -124,7 +170,7 @@ pub fn yuv420p10_to_rgb_row( /// path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_u16_row( +pub fn yuv420p10_to_rgb_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -133,6 +179,7 @@ pub fn yuv420p10_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_elems(width); @@ -141,56 +188,77 @@ pub fn yuv420p10_to_rgb_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, @@ -198,8 +266,32 @@ pub fn yuv420p10_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( - y, u_half, v_half, rgb_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgb_u16_row::<10, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgb_u16_row::<10, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p10_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p10_to_rgb_u16_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, ); } @@ -214,7 +306,7 @@ pub fn yuv420p10_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_row( +pub fn yuv420p10_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -223,6 +315,7 @@ pub fn yuv420p10_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -231,54 +324,73 @@ pub fn yuv420p10_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<10, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<10, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<10, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<10, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, @@ -286,8 +398,32 @@ pub fn yuv420p10_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<10, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_row::<10, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p10_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p10_to_rgba_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } @@ -300,7 +436,7 @@ pub fn yuv420p10_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_u16_row( +pub fn yuv420p10_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -309,6 +445,7 @@ pub fn yuv420p10_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -317,54 +454,73 @@ pub fn yuv420p10_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<10, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<10, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -372,7 +528,201 @@ pub fn yuv420p10_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<10, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_row::<10, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_u16_row::<10, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p10_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p10_to_rgba_u16_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } + +#[cfg(all(test, feature = "std"))] +// Host-independent BE parity tests — fixtures built byte-wise via +// `to_le_bytes` / `to_be_bytes` and reinterpreted with `from_ne_bytes`, +// so the LE and BE variants exercise the dispatcher's `BE=false` and +// `BE=true` paths regardless of host endianness. Mirrors PR #82 +// `8f2e329` and the per-arch test convention. +mod be_parity_tests { + use super::*; + + /// Build LE/BE host-native u16 buffers from a slice of intended 10-bit + /// samples (values must already be in the correct low-bit-packed form + /// — `[0, 1023]`). Returns `(le, be)` where each contains `u16` + /// elements that, when serialized via `to_ne_bytes`, reproduce the + /// LE/BE wire bytes. 
+    fn split_le_be(intended: &[u16]) -> (std::vec::Vec<u16>, std::vec::Vec<u16>) {
+        let le_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+        let be_bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_be_bytes()).collect();
+        let le: std::vec::Vec<u16> = le_bytes
+            .chunks_exact(2)
+            .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+            .collect();
+        let be: std::vec::Vec<u16> = be_bytes
+            .chunks_exact(2)
+            .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+            .collect();
+        (le, be)
+    }
+
+    fn pseudo_y(width: usize, seed: u32) -> std::vec::Vec<u16> {
+        (0..width)
+            .map(|i| ((seed.wrapping_mul(i as u32 + 1).wrapping_add(0x55)) & 0x3FF) as u16)
+            .collect()
+    }
+    fn pseudo_uv(half: usize, seed: u32) -> std::vec::Vec<u16> {
+        (0..half)
+            .map(|i| ((seed.wrapping_mul(i as u32 + 7).wrapping_add(0x123)) & 0x3FF) as u16)
+            .collect()
+    }
+
+    #[test]
+    #[cfg_attr(
+        miri,
+        ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri"
+    )]
+    fn yuv420p10_dispatch_be_le_parity_simd_and_scalar() {
+        for w in [8usize, 16, 24] {
+            let half = w / 2;
+            let y_int = pseudo_y(w, 0xA17F);
+            let u_int = pseudo_uv(half, 0xC0DE);
+            let v_int = pseudo_uv(half, 0xBEEF);
+            let (y_le, y_be) = split_le_be(&y_int);
+            let (u_le, u_be) = split_le_be(&u_int);
+            let (v_le, v_be) = split_le_be(&v_int);
+
+            for &use_simd in &[false, true] {
+                // u8 RGB
+                let mut out_le = std::vec![0u8; w * 3];
+                let mut out_be = std::vec![0u8; w * 3];
+                yuv420p10_to_rgb_row_endian(
+                    &y_le,
+                    &u_le,
+                    &v_le,
+                    &mut out_le,
+                    w,
+                    ColorMatrix::Bt709,
+                    false,
+                    use_simd,
+                    false,
+                );
+                yuv420p10_to_rgb_row_endian(
+                    &y_be,
+                    &u_be,
+                    &v_be,
+                    &mut out_be,
+                    w,
+                    ColorMatrix::Bt709,
+                    false,
+                    use_simd,
+                    true,
+                );
+                assert_eq!(
+                    out_le, out_be,
+                    "yuv420p10 rgb BE/LE parity (w={w}, simd={use_simd})"
+                );
+
+                // u16 RGB
+                let mut out_le16 = std::vec![0u16; w * 3];
+                let mut out_be16 = std::vec![0u16; w * 3];
+                yuv420p10_to_rgb_u16_row_endian(
+                    &y_le,
+                    &u_le,
+                    &v_le,
+                    &mut out_le16,
+                    w,
+                    ColorMatrix::Bt709,
+ false, + use_simd, + false, + ); + yuv420p10_to_rgb_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuv420p10 rgb_u16 BE/LE parity"); + + // u8 RGBA + let mut out_le4 = std::vec![0u8; w * 4]; + let mut out_be4 = std::vec![0u8; w * 4]; + yuv420p10_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le4, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv420p10_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be4, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4, out_be4, "yuv420p10 rgba BE/LE parity"); + + // u16 RGBA + let mut out_le4u = std::vec![0u16; w * 4]; + let mut out_be4u = std::vec![0u16; w * 4]; + yuv420p10_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &mut out_le4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuv420p10_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &mut out_be4u, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le4u, out_be4u, "yuv420p10 rgba_u16 BE/LE parity"); + } + } + } +} diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index bf84f8ab..0b1a3ff3 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -25,7 +25,7 @@ use crate::{ /// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_row( +pub fn yuv420p12_to_rgb_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -34,6 +34,7 @@ pub fn yuv420p12_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -42,54 +43,73 @@ pub fn yuv420p12_to_rgb_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<12, true>(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<12, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<12, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, @@ -97,7 +117,33 @@ pub fn yuv420p12_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<12, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p_n_to_rgb_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgb_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuv420p12_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p12_to_rgb_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed @@ -105,7 +151,7 @@ pub fn yuv420p12_to_rgb_row( /// `yuv420p12le` convention — upper 4 bits zero). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_u16_row( +pub fn yuv420p12_to_rgb_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -114,6 +160,7 @@ pub fn yuv420p12_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_elems(width); @@ -122,51 +169,72 @@ pub fn yuv420p12_to_rgb_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12, true>( y, u_half, v_half, 
rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -174,8 +242,32 @@ pub fn yuv420p12_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<12, false>( - y, u_half, v_half, rgb_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgb_u16_row::<12, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgb_u16_row::<12, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p12_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p12_to_rgb_u16_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, ); } @@ -190,7 +282,7 @@ pub fn yuv420p12_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_row( +pub fn yuv420p12_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -199,6 +291,7 @@ pub fn yuv420p12_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -207,54 +300,73 @@ pub fn yuv420p12_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<12, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<12, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<12, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<12, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -262,8 +374,32 @@ pub fn yuv420p12_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<12, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_row::<12, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p12_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p12_to_rgba_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } @@ -276,7 +412,7 @@ pub fn yuv420p12_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_u16_row( +pub fn yuv420p12_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -285,6 +421,7 @@ pub fn yuv420p12_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -293,54 +430,73 @@ pub fn yuv420p12_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<12, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<12, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, @@ -348,7 +504,31 @@ pub fn yuv420p12_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<12, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_row::<12, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_u16_row::<12, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p12_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p12_to_rgba_u16_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index e98c61ac..ef46ebd6 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -20,7 +20,7 @@ use crate::{ /// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_row( +pub fn yuv420p14_to_rgb_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -29,6 +29,7 @@ pub fn yuv420p14_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -37,49 +38,68 @@ pub fn yuv420p14_to_rgb_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<14, true>(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<14, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<14, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<14, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14, true>( + y, u_half, v_half, rgb_out, width, 
matrix, full_range, + ); } + ); return; } }, @@ -87,14 +107,40 @@ pub fn yuv420p14_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<14, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p_n_to_rgb_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgb_row::<14, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuv420p14_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p14_to_rgb_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed /// `u16` RGB (14‑bit values in the low 14 of each `u16`). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_u16_row( +pub fn yuv420p14_to_rgb_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -103,6 +149,7 @@ pub fn yuv420p14_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_elems(width); @@ -111,51 +158,72 @@ pub fn yuv420p14_to_rgb_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<14, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14, true>( y, u_half, v_half, 
rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -163,8 +231,32 @@ pub fn yuv420p14_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<14, false>( - y, u_half, v_half, rgb_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgb_u16_row::<14, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgb_u16_row::<14, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p14_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p14_to_rgb_u16_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, ); } @@ -179,7 +271,7 @@ pub fn yuv420p14_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_row( +pub fn yuv420p14_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -188,6 +280,7 @@ pub fn yuv420p14_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -196,54 +289,73 @@ pub fn yuv420p14_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<14, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<14, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<14, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<14, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -251,8 +363,32 @@ pub fn yuv420p14_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<14, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_row::<14, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p14_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p14_to_rgba_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } @@ -265,7 +401,7 @@ pub fn yuv420p14_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_u16_row( +pub fn yuv420p14_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -274,6 +410,7 @@ pub fn yuv420p14_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -282,54 +419,73 @@ pub fn yuv420p14_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<14, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<14, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, @@ -337,7 +493,31 @@ pub fn yuv420p14_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<14, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_row::<14, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_u16_row::<14, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p14_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p14_to_rgba_u16_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index 1f4dfa9e..6e17b624 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -24,7 +24,7 @@ use crate::{ /// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_row( +pub fn yuv420p16_to_rgb_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -33,6 +33,7 @@ pub fn yuv420p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -41,41 +42,52 @@ pub fn yuv420p16_to_rgb_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, 
matrix, full_range); } + ); return; } }, @@ -83,14 +95,36 @@ pub fn yuv420p16_to_rgb_row( } } - scalar::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range), + scalar::yuv_420p16_to_rgb_row::(y, u_half, v_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv420p16_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p16_to_rgb_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** /// packed `u16` RGB (full-range output in `[0, 65535]`). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_u16_row( +pub fn yuv420p16_to_rgb_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -99,6 +133,7 @@ pub fn yuv420p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_elems(width); @@ -107,41 +142,52 @@ pub fn yuv420p16_to_rgb_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { 
arch::wasm_simd128::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -149,7 +195,33 @@ pub fn yuv420p16_to_rgb_u16_row( } } - scalar::yuv_420p16_to_rgb_u16_row::(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p16_to_rgb_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range + ), + scalar::yuv_420p16_to_rgb_u16_row::( + y, u_half, v_half, rgb_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuv420p16_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p16_to_rgb_u16_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** @@ -161,7 +233,7 @@ pub fn yuv420p16_to_rgb_u16_row( /// scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_row( +pub fn yuv420p16_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -170,6 +242,7 @@ pub fn yuv420p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -178,41 +251,52 @@ pub fn yuv420p16_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_row::(y, u_half, 
v_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -220,7 +304,29 @@ pub fn yuv420p16_to_rgba_row( } } - scalar::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range), + scalar::yuv_420p16_to_rgba_row::(y, u_half, v_half, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv420p16_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p16_to_rgba_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** @@ -233,7 +339,7 @@ pub fn yuv420p16_to_rgba_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_u16_row( +pub fn yuv420p16_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -242,6 +348,7 @@ pub fn yuv420p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -250,41 +357,52 @@ pub fn yuv420p16_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { 
arch::wasm_simd128::yuv_420p16_to_rgba_u16_row::(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -292,7 +410,31 @@ pub fn yuv420p16_to_rgba_u16_row( } } - scalar::yuv_420p16_to_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p16_to_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p16_to_rgba_u16_row::( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p16_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p16_to_rgba_u16_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 3d770678..740b0315 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -32,7 +32,7 @@ use crate::{ /// path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_row( +pub fn yuv420p9_to_rgb_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -41,6 +41,7 @@ pub fn yuv420p9_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_bytes(width); @@ -49,54 +50,73 @@ pub fn yuv420p9_to_rgb_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_row::<9, true>(y, u_half, v_half, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_row::<9, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_row::<9, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_row::<9, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, @@ -104,14 +124,38 @@ pub fn yuv420p9_to_rgb_row( } } - scalar::yuv_420p_n_to_rgb_row::<9, false>(y, u_half, v_half, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_420p_n_to_rgb_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgb_row::<9, true>(y, u_half, v_half, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv420p9_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p9_to_rgb_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed /// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_u16_row( +pub fn yuv420p9_to_rgb_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -120,6 +164,7 @@ pub fn yuv420p9_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgb_min = rgb_row_elems(width); @@ -128,56 +173,77 @@ pub fn yuv420p9_to_rgb_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::neon::yuv_420p_n_to_rgb_u16_row::<9, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9, false>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9, true>( y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -185,8 +251,32 @@ pub fn yuv420p9_to_rgb_u16_row( } } - scalar::yuv_420p_n_to_rgb_u16_row::<9, false>( - y, u_half, v_half, rgb_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgb_u16_row::<9, false>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgb_u16_row::<9, true>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p9_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p9_to_rgb_u16_row_endian( + y, u_half, v_half, rgb_out, width, matrix, full_range, use_simd, false, ); } @@ -207,7 +297,7 @@ pub fn yuv420p9_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_row( +pub fn yuv420p9_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -216,6 +306,7 @@ pub fn yuv420p9_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -224,55 +315,74 @@ pub fn yuv420p9_to_rgba_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified on this CPU; bounds / parity are // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_row::<9, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_row::<9, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_row::<9, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_row::<9, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, @@ -280,8 +390,32 @@ pub fn yuv420p9_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_row::<9, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_row::<9, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p9_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p9_to_rgba_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } @@ -294,7 +428,7 @@ pub fn yuv420p9_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_u16_row( +pub fn yuv420p9_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -303,6 +437,7 @@ pub fn yuv420p9_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -311,54 +446,73 @@ pub fn yuv420p9_to_rgba_u16_row( assert!(v_half.len() >= width / 2, "v_half row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<9, false>(y, u_half, v_half, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_row::<9, true>(y, u_half, v_half, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9, true>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9, false>( + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9, false>( y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } + ); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); } + ); return; } }, @@ -366,7 +520,31 @@ pub fn yuv420p9_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_row::<9, false>( - y, u_half, v_half, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_row::<9, false>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ), + scalar::yuv_420p_n_to_rgba_u16_row::<9, true>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ) + ); +} + +/// LE-only wrapper around [`yuv420p9_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv420p9_to_rgba_u16_row_endian( + y, u_half, v_half, rgba_out, width, matrix, full_range, use_simd, false, ); } diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 00b78660..5d1444c6 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -45,7 +45,7 @@ use crate::{ /// keeping the `` generic internal. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_row( +pub(crate) fn yuv_444p_n_to_rgb_row( y: &[u16], u: &[u16], v: &[u16], @@ -67,7 +67,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -76,21 +76,21 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -99,7 +99,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -108,7 +108,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( } } - scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } /// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. @@ -122,7 +122,7 @@ pub(crate) fn yuv_444p_n_to_rgb_row( /// dedicated i64-chroma kernel family. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_u16_row( +pub(crate) fn yuv_444p_n_to_rgb_u16_row( y: &[u16], u: &[u16], v: &[u16], @@ -144,7 +144,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( if neon_available() { // SAFETY: NEON verified. unsafe { - arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -153,21 +153,21 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( if avx512_available() { // SAFETY: AVX‑512BW verified. unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if avx2_available() { // SAFETY: AVX2 verified. 
unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } if sse41_available() { // SAFETY: SSE4.1 verified. unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -176,7 +176,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( if simd128_available() { // SAFETY: simd128 compile‑time verified. unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } return; } @@ -185,7 +185,7 @@ pub(crate) fn yuv_444p_n_to_rgb_u16_row( } } - scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } pub(super) mod yuv444p10; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 245c765a..0b3b1174 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -19,8 +19,33 @@ use crate::{ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; -/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10, false>`. +/// YUV 4:4:4 planar 10-bit → u8 RGB. Endian-aware variant. `big_endian +/// = true` selects the BE-encoded `u16` plane contract (samples stored +/// MSB-first); `false` is the standard LE contract. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_row::<10, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_row::<10, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p10_to_rgb_row_endian`]; preserves the +/// pre-endian-aware public signature so existing little-endian callers +/// compile unchanged. Equivalent to `yuv444p10_to_rgb_row_endian(..., +/// big_endian = false)`. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv444p10_to_rgb_row( @@ -33,10 +58,32 @@ pub fn yuv444p10_to_rgb_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p10_to_rgb_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } -/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. +/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. Endian-aware +/// variant. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_u16_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_u16_row::<10, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_u16_row::<10, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p10_to_rgb_u16_row_endian`]. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuv444p10_to_rgb_u16_row( @@ -49,16 +96,17 @@ pub fn yuv444p10_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p10_to_rgb_u16_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). +/// **RGBA** (`R, G, B, 0xFF`). Endian-aware variant: `big_endian = +/// true` selects the BE-encoded `u16` plane contract. /// /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_row( +pub fn yuv444p10_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -67,6 +115,7 @@ pub fn yuv444p10_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -74,46 +123,57 @@ pub fn yuv444p10_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -121,17 +181,40 @@ pub fn yuv444p10_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p10_to_rgba_row_endian`]; preserves +/// the pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p10_to_rgba_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** /// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha -/// element is `1023`. +/// element is `1023`. Endian-aware variant. /// /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_u16_row( +pub fn yuv444p10_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -140,6 +223,7 @@ pub fn yuv444p10_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -147,46 +231,57 @@ pub fn yuv444p10_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -194,5 +289,26 @@ pub fn yuv444p10_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_u16_row::<10, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_u16_row::<10, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p10_to_rgba_u16_row_endian`]. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p10_to_rgba_u16_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 2eec3e85..9782ae3f 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -22,6 +22,28 @@ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; /// YUV 4:4:4 planar 12-bit → u8 RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_row::<12, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_row::<12, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p12_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p12_to_rgb_row( y: &[u16], u: &[u16], @@ -32,12 +54,34 @@ pub fn yuv444p12_to_rgb_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p12_to_rgb_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_u16_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_u16_row::<12, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_u16_row::<12, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p12_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p12_to_rgb_u16_row( y: &[u16], u: &[u16], @@ -48,7 +92,7 @@ pub fn yuv444p12_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p12_to_rgb_u16_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** @@ -57,7 +101,7 @@ pub fn yuv444p12_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_row( +pub fn yuv444p12_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -66,6 +110,7 @@ pub fn yuv444p12_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -73,46 +118,57 @@ pub fn yuv444p12_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -120,7 +176,29 @@ pub fn yuv444p12_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p12_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p12_to_rgba_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** @@ -130,7 +208,7 @@ pub fn yuv444p12_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_u16_row( +pub fn yuv444p12_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -139,6 +217,7 @@ pub fn yuv444p12_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -146,46 +225,57 @@ pub fn yuv444p12_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -193,5 +283,27 @@ pub fn yuv444p12_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_u16_row::<12, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_u16_row::<12, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p12_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p12_to_rgba_u16_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 0d6f7104..ef667968 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -22,6 +22,28 @@ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; /// YUV 4:4:4 planar 14-bit → u8 RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_row::<14, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_row::<14, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p14_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p14_to_rgb_row( y: &[u16], u: &[u16], @@ -32,12 +54,34 @@ pub fn yuv444p14_to_rgb_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p14_to_rgb_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_u16_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_u16_row::<14, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_u16_row::<14, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p14_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p14_to_rgb_u16_row( y: &[u16], u: &[u16], @@ -48,7 +92,7 @@ pub fn yuv444p14_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p14_to_rgb_u16_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** @@ -57,7 +101,7 @@ pub fn yuv444p14_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_row( +pub fn yuv444p14_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -66,6 +110,7 @@ pub fn yuv444p14_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -73,46 +118,57 @@ pub fn yuv444p14_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! 
{ target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -120,7 +176,29 @@ pub fn yuv444p14_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p14_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p14_to_rgba_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** @@ -130,7 +208,7 @@ pub fn yuv444p14_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_u16_row( +pub fn yuv444p14_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -139,6 +217,7 @@ pub fn yuv444p14_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -146,46 +225,57 @@ pub fn yuv444p14_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -193,5 +283,27 @@ pub fn yuv444p14_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_u16_row::<14, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_u16_row::<14, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p14_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p14_to_rgba_u16_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index c32ad5ab..6a7211f1 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -24,7 +24,7 @@ use crate::{ /// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_row( +pub fn yuv444p16_to_rgb_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -33,6 +33,7 @@ pub fn yuv444p16_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgb_min = rgb_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -40,46 +41,57 @@ pub fn yuv444p16_to_rgb_row( assert!(v.len() >= width, "v row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -87,7 +99,27 @@ pub fn yuv444p16_to_rgb_row( } } - scalar::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range), + scalar::yuv_444p16_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p16_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p16_to_rgb_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range @@ -95,7 +127,7 @@ pub fn yuv444p16_to_rgb_row( /// i64 to avoid i32 overflow at 16-bit limited range. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_u16_row( +pub fn yuv444p16_to_rgb_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -104,6 +136,7 @@ pub fn yuv444p16_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgb_min = rgb_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -111,46 +144,57 @@ pub fn yuv444p16_to_rgb_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); } + ); return; } }, @@ -158,7 +202,27 @@ pub fn yuv444p16_to_rgb_u16_row( } } - scalar::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range), + scalar::yuv_444p16_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p16_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p16_to_rgb_u16_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** @@ -168,7 +232,7 @@ pub fn yuv444p16_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_row( +pub fn yuv444p16_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -177,6 +241,7 @@ pub fn yuv444p16_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -184,46 +249,57 @@ pub fn yuv444p16_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -231,7 +307,29 @@ pub fn yuv444p16_to_rgba_row( } } - scalar::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p16_to_rgba_row::(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p16_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p16_to_rgba_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** @@ -242,7 +340,7 @@ pub fn yuv444p16_to_rgba_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_u16_row( +pub fn yuv444p16_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -251,6 +349,7 @@ pub fn yuv444p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -258,46 +357,57 @@ pub fn yuv444p16_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -305,5 +415,27 @@ pub fn yuv444p16_to_rgba_u16_row( } } - scalar::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p16_to_rgba_u16_row::(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p16_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p16_to_rgba_u16_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index 04bb0a5e..9d685f4b 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -27,6 +27,28 @@ use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; /// crate-internal `yuv_444p_n_to_rgb_row::<9, false>`. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_row::<9, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_row::<9, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p9_to_rgb_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p9_to_rgb_row( y: &[u16], u: &[u16], @@ -37,12 +59,34 @@ pub fn yuv444p9_to_rgb_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p9_to_rgb_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } /// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_u16_row_endian( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, +) { + if big_endian { + yuv_444p_n_to_rgb_u16_row::<9, true>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } else { + yuv_444p_n_to_rgb_u16_row::<9, false>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + } +} + +/// LE-only wrapper around [`yuv444p9_to_rgb_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] pub fn yuv444p9_to_rgb_u16_row( y: &[u16], u: &[u16], @@ -53,7 +97,7 @@ pub fn yuv444p9_to_rgb_u16_row( full_range: bool, use_simd: bool, ) { - yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); + yuv444p9_to_rgb_u16_row_endian(y, u, v, rgb_out, width, matrix, full_range, use_simd, false); } // ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- @@ -73,7 +117,7 @@ pub fn yuv444p9_to_rgb_u16_row( /// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_row( +pub fn yuv444p9_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -82,6 +126,7 @@ pub fn yuv444p9_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); @@ -89,46 +134,57 @@ pub fn yuv444p9_to_rgba_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -136,7 +192,29 @@ pub fn yuv444p9_to_rgba_row( } } - scalar::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p9_to_rgba_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p9_to_rgba_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } /// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** @@ -148,7 +226,7 @@ pub fn yuv444p9_to_rgba_row( /// `use_simd = false` forces the scalar reference path. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_u16_row( +pub fn yuv444p9_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -157,6 +235,7 @@ pub fn yuv444p9_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -164,46 +243,57 @@ pub fn yuv444p9_to_rgba_u16_row( assert!(v.len() >= width, "v row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -211,5 +301,27 @@ pub fn yuv444p9_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range); + dispatch_be!( + scalar::yuv_444p_n_to_rgba_u16_row::<9, false>(y, u, v, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_u16_row::<9, true>(y, u, v, rgba_out, width, matrix, full_range) + ); +} + +/// LE-only wrapper around [`yuv444p9_to_rgba_u16_row_endian`]; preserves the pre-endian-aware +/// public signature so existing little-endian callers compile unchanged. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv444p9_to_rgba_u16_row_endian( + y, u, v, rgba_out, width, matrix, full_range, use_simd, false, + ); } diff --git a/src/row/mod.rs b/src/row/mod.rs index b2502de8..ffde246f 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -800,7 +800,7 @@ mod overflow_tests { let u: [u16; 0] = []; let v: [u16; 0] = []; let mut rgb: [u16; 0] = []; - yuv_444p_n_to_rgb_u16_row::<10>( + yuv_444p_n_to_rgb_u16_row::<10, false>( &y, &u, &v, From b882233e5d9edea7829132da8472832855375403 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Sat, 9 May 2026 14:39:49 +1200 Subject: [PATCH 7/8] feat(be-yuv-hb): wire endian-aware row dispatch for YUVA high-bit families MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex round-5 found that the BE row kernels added in this PR are not reachable from the public row dispatch API for the high-bit YUVA 4:2:0 (yuva420p9/10/12/16) and YUVA 4:4:4 (yuva444p9/10/12/14/16) families: the existing dispatcher signatures hard-wire `BE = false` to every SIMD/scalar backend, mirroring the same wiring gap fixed for non-alpha high-bit YUV planar formats in commit `1c2df3d`. BE-encoded YUVA input fed through these dispatchers was silently decoded as LE — corrupting Y/U/V *and* alpha samples without an error path. 
This commit adds endian-aware dispatch entry points across the two YUVA
high-bit dispatcher files, matching the pattern already established by
`1c2df3d`:

* For each high-bit `yuva{420,444}p{N}_to_rgba[_u16]_row(...)` public
  dispatcher, add a new `yuva{420,444}p{N}_to_rgba[_u16]_row_endian(...,
  big_endian: bool)` variant that routes `BE=true` through every backend
  (NEON / SSE4.1 / AVX2 / AVX-512BW / wasm-simd128 / scalar) via the
  existing `<BITS, BE>` (or `<BE>` for 16-bit) const-generic kernel pair.
  The alpha-source `u16` load is endian-aware too — the kernels already
  thread `BE` through `endian::load_endian_u16x*` for the alpha lane, so
  the dispatcher fix is purely at the dispatch site.

* The original LE-only function is kept as a one-line backwards-compat
  wrapper that calls `_endian(..., false)`, preserving the LE-encoded
  `Frame` byte contract for sinkers (sinker call sites in
  `src/sinker/mixed/**` are deliberately unchanged — they continue to use
  the LE wrappers, matching the convention from PR #92).

New `_endian` entry points (18 total):

* sub_4_2_0.rs (yuva420p{9,10,12,16}, each × {rgba_u8, rgba_u16}) = 8
* sub_4_4_4.rs (yuva444p{9,10,12,14,16}, each × {rgba_u8, rgba_u16}) = 10

Kernel-side BE wiring: no changes needed. All YUVA kernels
(`yuv_420p_n_to_rgba[_u16]_with_alpha_src_row`,
`yuv_444p_n_to_rgba[_u16]_with_alpha_src_row`,
`yuv_420p16_to_rgba[_u16]_with_alpha_src_row`,
`yuv_444p16_to_rgba[_u16]_with_alpha_src_row`) across scalar / NEON /
SSE4.1 / AVX2 / AVX-512BW / wasm-simd128 already accept and correctly
route `<BE>` through the endian-aware u16 load helpers, including the
alpha-source load lane. This was confirmed by source audit of
`src/row/scalar/yuv_planar_high_bit.rs`,
`src/row/scalar/yuv_planar_16bit.rs`, and the per-arch
`yuv_planar_high_bit.rs` / `yuv_planar_16bit.rs` files; the gap was
purely in the dispatch wrapper layer.

YUVA 4:2:2 audit: there is no separate
`src/row/dispatch/yuva/sub_4_2_2.rs` file.
Per `yuva/mod.rs`'s module docs, `MixedSinker` delegates row-level work to the `yuva420p*_to_rgba*_with_alpha_src_row` dispatchers (the per-row chroma layout is identical between 4:2:0 and 4:2:2; only the vertical walker differs). Fixing the 4:2:0 dispatcher therefore also reaches Yuva422p10/12/16 BE consumers via the existing delegation. Adds dispatch-level BE/LE parity tests in `src/row/dispatch/be_yuv_hb_parity_tests.rs` mirroring the existing `yuv444p10` / `yuv420p16` / `yuv444p16` patterns: * `yuva420p10_dispatch_be_le_parity_simd_and_scalar` — w ∈ {8, 16, 24}; rgba u8 + rgba u16; `use_simd` ∈ {false, true}. * `yuva444p10_dispatch_be_le_parity_simd_and_scalar` — same matrix. * `yuva420p16_dispatch_be_le_parity` — dedicated 16-bit i64-chroma family. * `yuva444p16_dispatch_be_le_parity` — same. Each test builds LE/BE fixtures via `to_le_bytes` / `to_be_bytes` + `u16::from_ne_bytes` (host-independent — no `cfg(target_endian)` gate) including the alpha plane, and asserts byte-identical output between `_endian(LE, false)` and `_endian(BE, true)`. SIMD-active variants carry `#[cfg_attr(miri, ignore)]`. 4 new tests; all pass on aarch64-apple-darwin alongside the pre-existing 2354 lib tests for a total of 2358. 
Verified: * `cargo test --target aarch64-apple-darwin --lib` — 2358 passed * `cargo build --target x86_64-apple-darwin --tests` — 0 warnings * `RUSTFLAGS="-C target-feature=+simd128" cargo build --target wasm32-unknown-unknown --tests` — clean * `cargo build --no-default-features` — clean * `cargo fmt --check` — clean * `cargo clippy --all-targets --all-features -- -D warnings` — clean * `cargo check --target s390x-unknown-linux-gnu --lib` — clean Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/be_yuv_hb_parity_tests.rs | 328 ++++++ src/row/dispatch/yuva/sub_4_2_0.rs | 746 ++++++++----- src/row/dispatch/yuva/sub_4_4_4.rs | 1118 ++++++-------------- 3 files changed, 1153 insertions(+), 1039 deletions(-) diff --git a/src/row/dispatch/be_yuv_hb_parity_tests.rs b/src/row/dispatch/be_yuv_hb_parity_tests.rs index 02257ad7..46716bf8 100644 --- a/src/row/dispatch/be_yuv_hb_parity_tests.rs +++ b/src/row/dispatch/be_yuv_hb_parity_tests.rs @@ -722,3 +722,331 @@ fn p416_dispatch_be_le_parity() { assert_eq!(out_le4u, out_be4u, "p416 rgba_u16 BE/LE parity"); } } + +// ---- YUVA dispatch parity (codex round-5 follow-up) --------------------- +// +// Mirrors the non-alpha YUV high-bit dispatcher tests above. Adds +// dispatch-level BE/LE parity coverage for the YUVA 4:2:0 and 4:4:4 +// families — which were missed by round-3 and forced through +// `BE = false` regardless of the source contract. Uses the same +// `to_le_bytes` / `to_be_bytes` host-independent fixture pattern; +// asserts byte-identical output between +// `_endian(LE_buf, false)` and `_endian(BE_buf, true)` for both +// `use_simd = true` and `use_simd = false`. 
+ +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuva420p10_dispatch_be_le_parity_simd_and_scalar() { + for w in [8usize, 16, 24] { + let y_int = pseudo_plane(w, 0x1010, 0x3FF); + let u_int = pseudo_plane(w / 2, 0x2020, 0x3FF); + let v_int = pseudo_plane(w / 2, 0x3030, 0x3FF); + let a_int = pseudo_plane(w, 0x4040, 0x3FF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + let (a_le, a_be) = split_le_be(&a_int); + + for &use_simd in &[false, true] { + // u8 RGBA — exercises BITS-generic + // `yuv_420p_n_to_rgba_with_alpha_src_row<10, BE>` across all backends. + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + yuva420p10_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva420p10_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!( + out_le, out_be, + "yuva420p10 rgba BE/LE parity (w={w}, simd={use_simd})" + ); + + // u16 RGBA — native-depth path, alpha sourced at full BITS. 
+ let mut out_le16 = std::vec![0u16; w * 4]; + let mut out_be16 = std::vec![0u16; w * 4]; + yuva420p10_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva420p10_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!( + out_le16, out_be16, + "yuva420p10 rgba_u16 BE/LE parity (w={w}, simd={use_simd})" + ); + } + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuva444p10_dispatch_be_le_parity_simd_and_scalar() { + for w in [8usize, 16, 24] { + let y_int = pseudo_plane(w, 0x1111, 0x3FF); + let u_int = pseudo_plane(w, 0x2222, 0x3FF); + let v_int = pseudo_plane(w, 0x3333, 0x3FF); + let a_int = pseudo_plane(w, 0x4444, 0x3FF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + let (a_le, a_be) = split_le_be(&a_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + yuva444p10_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva444p10_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!( + out_le, out_be, + "yuva444p10 rgba BE/LE parity (w={w}, simd={use_simd})" + ); + + let mut out_le16 = std::vec![0u16; w * 4]; + let mut out_be16 = std::vec![0u16; w * 4]; + yuva444p10_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva444p10_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!( + out_le16, out_be16, + 
"yuva444p10 rgba_u16 BE/LE parity (w={w}, simd={use_simd})" + ); + } + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuva420p16_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0xA1A1, 0xFFFF); + let u_int = pseudo_plane(w / 2, 0xB2B2, 0xFFFF); + let v_int = pseudo_plane(w / 2, 0xC3C3, 0xFFFF); + let a_int = pseudo_plane(w, 0xD4D4, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + let (a_le, a_be) = split_le_be(&a_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + yuva420p16_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva420p16_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "yuva420p16 rgba BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 4]; + let mut out_be16 = std::vec![0u16; w * 4]; + yuva420p16_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva420p16_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuva420p16 rgba_u16 BE/LE parity"); + } +} + +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn yuva444p16_dispatch_be_le_parity() { + let w = 16usize; + let y_int = pseudo_plane(w, 0xA5A5, 0xFFFF); + let u_int = pseudo_plane(w, 0xB6B6, 0xFFFF); + let v_int = pseudo_plane(w, 0xC7C7, 0xFFFF); + let a_int = pseudo_plane(w, 0xD8D8, 0xFFFF); + let (y_le, y_be) = split_le_be(&y_int); + let (u_le, u_be) = 
split_le_be(&u_int); + let (v_le, v_be) = split_le_be(&v_int); + let (a_le, a_be) = split_le_be(&a_int); + + for &use_simd in &[false, true] { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + yuva444p16_to_rgba_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva444p16_to_rgba_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le, out_be, "yuva444p16 rgba BE/LE parity"); + + let mut out_le16 = std::vec![0u16; w * 4]; + let mut out_be16 = std::vec![0u16; w * 4]; + yuva444p16_to_rgba_u16_row_endian( + &y_le, + &u_le, + &v_le, + &a_le, + &mut out_le16, + w, + ColorMatrix::Bt709, + false, + use_simd, + false, + ); + yuva444p16_to_rgba_u16_row_endian( + &y_be, + &u_be, + &v_be, + &a_be, + &mut out_be16, + w, + ColorMatrix::Bt709, + false, + use_simd, + true, + ); + assert_eq!(out_le16, out_be16, "yuva444p16 rgba_u16 BE/LE parity"); + } +} diff --git a/src/row/dispatch/yuva/sub_4_2_0.rs b/src/row/dispatch/yuva/sub_4_2_0.rs index b1e0840b..754402ca 100644 --- a/src/row/dispatch/yuva/sub_4_2_0.rs +++ b/src/row/dispatch/yuva/sub_4_2_0.rs @@ -18,12 +18,20 @@ use crate::{ // ---- YUVA 4:2:0 RGBA dispatchers -------------------------------------- // // Per-row dispatchers for the YUVA 4:2:0 source family — Yuva420p -// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p16. The u8 RGBA -// dispatchers route through per-arch +// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p12 / Yuva420p16. The u8 +// RGBA dispatchers route through per-arch // `yuv_420*_to_rgba*_with_alpha_src_row` SIMD wrappers (Ship 8b-2b), // mirroring the non-alpha sibling dispatchers' `cfg_select!` blocks. -// The native-depth `u16` RGBA dispatchers below remain scalar pending -// Ship 8b-2c. 
+//
+// The high-bit dispatchers (`yuva420p9/10/12/16`) each expose an
+// `_endian` entry point that threads a runtime `big_endian: bool`
+// through every backend (scalar / NEON / SSE4.1 / AVX2 / AVX-512BW /
+// wasm-simd128) via the kernels' `<BITS, BE>` (or `<BE>` for 16-bit)
+// const-generic pair, including the alpha-source u16 load. The
+// pre-existing LE-only public function is preserved as a one-line
+// wrapper that forwards `big_endian = false`, mirroring the pattern
+// established for non-alpha YUV high-bit dispatchers in commit
+// `1c2df3d`.
 
 /// Converts one row of 8‑bit YUVA 4:2:0 to packed **8‑bit** **RGBA**.
 /// R / G / B are produced by the same Q15 i32 8‑bit kernel that backs
@@ -117,16 +125,19 @@ pub fn yuva420p_to_rgba_row(
 }
 
 /// Converts one row of **9‑bit** YUVA 4:2:0 to packed **8‑bit**
-/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family
-/// that backs [`yuv420p9_to_rgba_row`]; the per-pixel alpha byte is
-/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`)
-/// instead of being constant `0xFF`.
+/// **RGBA**. Endian-aware variant: `big_endian = true` selects the
+/// BE-encoded `u16` plane contract (samples stored MSB-first across
+/// Y / U / V **and** the alpha source plane); `false` is the
+/// standard LE contract. R / G / B are produced by the same Q15 i32
+/// kernel family that backs [`yuv420p9_to_rgba_row_endian`]; the
+/// per-pixel alpha byte is **sourced from `a`** (depth-converted via
+/// `a >> 1` to fit `u8`) instead of being constant `0xFF`.
 ///
 /// `use_simd = false` forces the scalar reference path; otherwise
-/// per-arch dispatch matches [`yuv420p9_to_rgba_row`]'s pattern.
+/// per-arch dispatch matches [`yuv420p9_to_rgba_row_endian`]'s pattern.
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_row( +pub fn yuva420p9_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -136,6 +147,7 @@ pub fn yuva420p9_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -145,56 +157,57 @@ pub fn yuva420p9_to_rgba_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -202,22 +215,49 @@ pub fn yuva420p9_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p9_to_rgba_row_endian`]; preserves +/// the pre-endian-aware public signature so existing little-endian +/// callers compile unchanged. Equivalent to +/// `yuva420p9_to_rgba_row_endian(.., big_endian = false)`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p9_to_rgba_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **9‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `511`. +/// packed **RGBA**. Endian-aware variant. 
Output is low-bit-packed +/// (`[0, 511]`); the per-pixel alpha element is **sourced from `a`** +/// (already at the source's native bit depth). /// /// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. +/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row_endian`]'s +/// pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_u16_row( +pub fn yuva420p9_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -227,6 +267,7 @@ pub fn yuva420p9_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -236,56 +277,57 @@ pub fn yuva420p9_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -293,22 +335,44 @@ pub fn yuva420p9_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p9_to_rgba_u16_row_endian`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p9_to_rgba_u16_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **10‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. +/// **RGBA**. Endian-aware variant: `big_endian = true` selects the +/// BE-encoded `u16` plane contract for Y / U / V **and** the alpha +/// source plane. 
/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_row`]'s pattern. +/// `use_simd = false` forces the scalar reference path. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_row( +pub fn yuva420p10_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -318,6 +382,7 @@ pub fn yuva420p10_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -327,56 +392,57 @@ pub fn yuva420p10_to_rgba_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -384,20 +450,42 @@ pub fn yuva420p10_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p10_to_rgba_row_endian`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p10_to_rgba_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **10‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** at native depth. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. +/// packed **RGBA**. Endian-aware variant. Output is low-bit-packed +/// (`[0, 1023]`); the per-pixel alpha element is sourced from `a` at +/// native depth. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_u16_row( +pub fn yuva420p10_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -407,6 +495,7 @@ pub fn yuva420p10_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -416,56 +505,57 @@ pub fn yuva420p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -473,22 +563,40 @@ pub fn yuva420p10_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p10_to_rgba_u16_row_endian`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p10_to_rgba_u16_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **12‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p12_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 4` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p12_to_rgba_row`]'s pattern. +/// **RGBA**. Endian-aware variant. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p12_to_rgba_row( +pub fn yuva420p12_to_rgba_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -498,6 +606,7 @@ pub fn yuva420p12_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_bytes(width); @@ -507,56 +616,57 @@ pub fn yuva420p12_to_rgba_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -564,20 +674,40 @@ pub fn yuva420p12_to_rgba_row( } } - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<12, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p12_to_rgba_row_endian`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p12_to_rgba_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **12‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); the -/// per-pixel alpha element is **sourced from `a`** at native depth. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p12_to_rgba_u16_row`]'s pattern. +/// packed **RGBA**. Endian-aware variant. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p12_to_rgba_u16_row( +pub fn yuva420p12_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -587,6 +717,7 @@ pub fn yuva420p12_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -596,56 +727,57 @@ pub fn yuva420p12_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -653,21 +785,41 @@ pub fn yuva420p12_to_rgba_u16_row( } } - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, false>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<12, true>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range + ) + ); +} + +/// LE-only wrapper around [`yuva420p12_to_rgba_u16_row_endian`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p12_to_rgba_u16_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **16‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same i32 kernel that backs -/// [`yuv420p16_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (depth-converted via `a >> 8` to fit `u8`). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_row`]'s pattern. +/// **RGBA**. Endian-aware variant. Uses the dedicated 16-bit i32 +/// chroma kernel family (i64 widening only on the u16 RGBA path). 
 #[cfg_attr(not(tarpaulin), inline(always))]
 #[allow(clippy::too_many_arguments)]
-pub fn yuva420p16_to_rgba_row(
+pub fn yuva420p16_to_rgba_row_endian(
     y: &[u16],
     u_half: &[u16],
     v_half: &[u16],
@@ -677,6 +829,7 @@ pub fn yuva420p16_to_rgba_row(
     matrix: ColorMatrix,
     full_range: bool,
     use_simd: bool,
+    big_endian: bool,
 ) {
     assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width");
     let rgba_min = rgba_row_bytes(width);
@@ -686,56 +839,57 @@ pub fn yuva420p16_to_rgba_row(
     assert!(a.len() >= width, "a row too short");
     assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
 
+    macro_rules! dispatch_be {
+        ($call_le:expr, $call_be:expr) => {
+            if big_endian { $call_be } else { $call_le }
+        };
+    }
+
     if use_simd {
         cfg_select! {
             target_arch = "aarch64" => {
                 if neon_available() {
                     // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::neon::yuv_420p16_to_rgba_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::neon::yuv_420p16_to_rgba_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
                     // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
                 if avx2_available() {
                     // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
                 if sse41_available() {
                     // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
                     // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
@@ -743,20 +897,42 @@ pub fn yuva420p16_to_rgba_row(
         }
     }
 
-    scalar::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
-        y, u_half, v_half, a, rgba_out, width, matrix, full_range,
+    dispatch_be!(
+        scalar::yuv_420p16_to_rgba_with_alpha_src_row::<false>(
+            y, u_half, v_half, a, rgba_out, width, matrix, full_range
+        ),
+        scalar::yuv_420p16_to_rgba_with_alpha_src_row::<true>(
+            y, u_half, v_half, a, rgba_out, width, matrix, full_range
+        )
+    );
+}
+
+/// LE-only wrapper around [`yuva420p16_to_rgba_row_endian`].
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuva420p16_to_rgba_row_endian( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false, ); } /// Converts one row of **16‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel -/// alpha element is **sourced from `a`** at native depth (no shift). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. +/// packed **RGBA**. Endian-aware variant. Full-range output in +/// `[0, 65535]`; the per-pixel alpha element is sourced from `a` at +/// native depth (no shift). #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva420p16_to_rgba_u16_row( +pub fn yuva420p16_to_rgba_u16_row_endian( y: &[u16], u_half: &[u16], v_half: &[u16], @@ -766,6 +942,7 @@ pub fn yuva420p16_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); let rgba_min = rgba_row_elems(width); @@ -775,56 +952,57 @@ pub fn yuva420p16_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
-                    unsafe {
-                        arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
             target_arch = "x86_64" => {
                 if avx512_available() {
                     // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
                 if avx2_available() {
                     // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
                 if sse41_available() {
                     // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
             target_arch = "wasm32" => {
                 if simd128_available() {
                     // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u_half, v_half, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
+                    dispatch_be!(
+                        unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); },
+                        unsafe { arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(y, u_half, v_half, a, rgba_out, width, matrix, full_range); }
+                    );
                     return;
                 }
             },
@@ -832,7 +1010,31 @@ pub fn yuva420p16_to_rgba_u16_row(
         }
     }
 
-    scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
-        y, u_half, v_half, a, rgba_out, width, matrix, full_range,
+    dispatch_be!(
+        scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::<false>(
+            y, u_half, v_half, a, rgba_out, width, matrix, full_range
+        ),
+        scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row::<true>(
+            y, u_half, v_half, a, rgba_out, width, matrix, full_range
+        )
+    );
+}
+
+/// LE-only wrapper around [`yuva420p16_to_rgba_u16_row_endian`].
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn yuva420p16_to_rgba_u16_row(
+    y: &[u16],
+    u_half: &[u16],
+    v_half: &[u16],
+    a: &[u16],
+    rgba_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    yuva420p16_to_rgba_u16_row_endian(
+        y, u_half, v_half, a, rgba_out, width, matrix, full_range, use_simd, false,
     );
 }
diff --git a/src/row/dispatch/yuva/sub_4_4_4.rs b/src/row/dispatch/yuva/sub_4_4_4.rs
index 8edce6fc..79195e72 100644
--- a/src/row/dispatch/yuva/sub_4_4_4.rs
+++ b/src/row/dispatch/yuva/sub_4_4_4.rs
@@ -22,6 +22,16 @@ use crate::{
 // and Yuva444p16 (dedicated i64 16-bit family). Both the u8 and
 // native-depth `u16` RGBA dispatchers route through per-arch SIMD
 // wrappers, mirroring the non-alpha siblings.
+//
+// The high-bit dispatchers (`yuva444p9/10/12/14/16`) each expose an
+// `_endian` entry point that threads a runtime `big_endian: bool`
+// through every backend (scalar / NEON / SSE4.1 / AVX-512BW /
+// wasm-simd128) via the kernels' `<BITS, BE>` (or `<BE>` for 16-bit)
+// const-generic pair, including the alpha-source u16 load. The
+// pre-existing LE-only public function is preserved as a one-line
+// wrapper that forwards `big_endian = false`, matching the pattern
+// established for non-alpha YUV high-bit dispatchers in commit
+// `1c2df3d`.
 
 /// Converts one row of **8-bit** YUVA 4:4:4 to packed **8-bit**
 /// **RGBA**. R / G / B are produced by the same Q15 i32 8-bit kernel
@@ -111,174 +121,340 @@ pub fn yuva444p_to_rgba_row(
     scalar::yuv_444_to_rgba_with_alpha_src_row(y, u, v, a, rgba_out, width, matrix, full_range);
 }
 
-/// Converts one row of **9-bit** YUVA 4:4:4 to packed **8-bit**
-/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family
-/// that backs [`yuva444p10_to_rgba_row`]; the per-pixel alpha byte is
-/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`)
-/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuva444p10_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p9_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); +// ---- BITS-generic 9/10/12/14 helpers (mirror non-alpha pattern) ---- + +macro_rules! impl_yuva444p_n_endian_pair { + ( + $bits:literal, + $endian_u8:ident, + $le_u8:ident, + $endian_u16:ident, + $le_u16:ident + ) => { + /// 4:4:4 YUVA high-bit (`BITS`) → packed u8 RGBA. Endian-aware + /// variant: `big_endian = true` selects the BE-encoded `u16` plane + /// contract for Y / U / V **and** the alpha source plane. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn $endian_u8( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, + ) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + _ => {} + } + } + + dispatch_be!( + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range) + ); + } - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} + /// LE-only wrapper preserving the pre-endian-aware public signature. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn $le_u8( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + ) { + $endian_u8(y, u, v, a, rgba_out, width, matrix, full_range, use_simd, false); } - } - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); + /// 4:4:4 YUVA high-bit (`BITS`) → native-depth u16 RGBA. + /// Endian-aware variant. + #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn $endian_u16( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + big_endian: bool, + ) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + dispatch_be!( + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range); } + ); + return; + } + }, + _ => {} + } + } + + dispatch_be!( + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, false>(y, u, v, a, rgba_out, width, matrix, full_range), + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<$bits, true>(y, u, v, a, rgba_out, width, matrix, full_range) + ); + } + + /// LE-only wrapper preserving the pre-endian-aware public signature. 
+ #[cfg_attr(not(tarpaulin), inline(always))] + #[allow(clippy::too_many_arguments)] + pub fn $le_u16( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, + ) { + $endian_u16(y, u, v, a, rgba_out, width, matrix, full_range, use_simd, false); + } + }; } -/// Converts one row of **9-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `511`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuva444p10_to_rgba_u16_row`]'s pattern. +impl_yuva444p_n_endian_pair!( + 9, + yuva444p9_to_rgba_row_endian, + yuva444p9_to_rgba_row, + yuva444p9_to_rgba_u16_row_endian, + yuva444p9_to_rgba_u16_row +); +impl_yuva444p_n_endian_pair!( + 10, + yuva444p10_to_rgba_row_endian, + yuva444p10_to_rgba_row, + yuva444p10_to_rgba_u16_row_endian, + yuva444p10_to_rgba_u16_row +); +impl_yuva444p_n_endian_pair!( + 12, + yuva444p12_to_rgba_row_endian, + yuva444p12_to_rgba_row, + yuva444p12_to_rgba_u16_row_endian, + yuva444p12_to_rgba_u16_row +); +impl_yuva444p_n_endian_pair!( + 14, + yuva444p14_to_rgba_row_endian, + yuva444p14_to_rgba_row, + yuva444p14_to_rgba_u16_row_endian, + yuva444p14_to_rgba_u16_row +); + +// ---- YUVA 4:4:4 16-bit RGBA dispatchers (Ship 8b-5a/b/c) ------------- +// +// Yuva444p16 uses dedicated 16-bit kernels rather than the +// BITS-generic Q15 i32 template (which only covers {9,10,12,14}). The +// 8-bit RGBA path uses the i32 chroma pipeline (output-target scaling +// keeps `coeff × u_d` inside i32); the native-depth `u16` RGBA path +// uses the widened i64 chroma kernel family. + +/// Converts one row of **16-bit** YUVA 4:4:4 to packed **8-bit** +/// **RGBA**. 
Endian-aware variant: `big_endian = true` selects the +/// BE-encoded `u16` plane contract for Y / U / V **and** the alpha +/// source plane. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva444p9_to_rgba_u16_row( +pub fn yuva444p16_to_rgba_row_endian( y: &[u16], u: &[u16], v: &[u16], a: &[u16], - rgba_out: &mut [u16], + rgba_out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { - let rgba_min = rgba_row_elems(width); + let rgba_min = rgba_row_bytes(width); assert!(y.len() >= width, "y row too short"); assert!(u.len() >= width, "u row too short"); assert!(v.len() >= width, "v row too short"); assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -286,22 +462,20 @@ pub fn yuva444p9_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<9, false>( - y, u, v, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( + y, u, v, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_444p16_to_rgba_with_alpha_src_row::( + y, u, v, a, rgba_out, width, matrix, full_range + ) ); } -/// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. 
R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv444p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern. +/// LE-only wrapper around [`yuva444p16_to_rgba_row_endian`]. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_row( +pub fn yuva444p16_to_rgba_row( y: &[u16], u: &[u16], v: &[u16], @@ -312,86 +486,17 @@ pub fn yuva444p10_to_rgba_row( full_range: bool, use_simd: bool, ) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, + yuva444p16_to_rgba_row_endian( + y, u, v, a, rgba_out, width, matrix, full_range, use_simd, false, ); } -/// Converts one row of **10-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `1023`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. +/// Converts one row of **16-bit** YUVA 4:4:4 to **native-depth `u16`** +/// packed **RGBA**. Endian-aware variant. Full-range output in +/// `[0, 65535]`. Uses the i64 chroma kernel family. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_u16_row( +pub fn yuva444p16_to_rgba_u16_row_endian( y: &[u16], u: &[u16], v: &[u16], @@ -401,6 +506,7 @@ pub fn yuva444p10_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + big_endian: bool, ) { let rgba_min = rgba_row_elems(width); assert!(y.len() >= width, "y row too short"); @@ -409,56 +515,57 @@ pub fn yuva444p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + macro_rules! 
dispatch_be { + ($call_le:expr, $call_be:expr) => { + if big_endian { $call_be } else { $call_le } + }; + } + if use_simd { cfg_select! { target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::neon::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::neon::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx512::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_avx2::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::x86_sse41::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } + dispatch_be!( + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); }, + unsafe { arch::wasm_simd128::yuv_444p16_to_rgba_u16_with_alpha_src_row::(y, u, v, a, rgba_out, width, matrix, full_range); } + ); return; } }, @@ -466,554 +573,31 @@ pub fn yuva444p10_to_rgba_u16_row( } } - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10, false>( - y, u, v, a, rgba_out, width, matrix, full_range, + dispatch_be!( + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( + y, u, v, a, rgba_out, width, matrix, full_range + ), + scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::( + y, u, v, a, rgba_out, width, matrix, full_range + ) ); } -/// Converts one row of **12-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv444p12_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 4` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p12_to_rgba_row`]'s pattern. +/// LE-only wrapper around [`yuva444p16_to_rgba_u16_row_endian`]. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] -pub fn yuva444p12_to_rgba_row( +pub fn yuva444p16_to_rgba_u16_row( y: &[u16], u: &[u16], v: &[u16], a: &[u16], - rgba_out: &mut [u8], + rgba_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, use_simd: bool, ) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **12-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `4095`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p12_to_rgba_u16_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p12_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<12, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} -/// Converts one row of **14-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv444p14_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 6` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p14_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p14_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **14-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `16383`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p14_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p14_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<14, false>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -// ---- YUVA 4:4:4 16-bit RGBA dispatchers (Ship 8b-5a/b/c) ------------- -// -// Yuva444p16 uses dedicated 16-bit kernels rather than the -// BITS-generic Q15 i32 template (which only covers {9,10,12,14}). The -// 8-bit RGBA path uses the i32 chroma pipeline (output-target scaling -// keeps `coeff × u_d` inside i32); the native-depth `u16` RGBA path -// uses the widened i64 chroma kernel family. Ship 8b-5b wired the u8 -// path; 8b-5c wires the u16 path. Both dispatchers now run cfg_select! -// per-arch with scalar fallback. - -/// Converts one row of **16-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. R / G / B are produced by the same i32 kernel that backs -/// [`yuv444p16_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (depth-converted via `a >> 8` to fit `u8`). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p16_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p16_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_with_alpha_src_row::( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **16-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel -/// alpha element is **sourced from `a`** at native depth (no shift). -/// Uses the i64 chroma kernel family. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p16_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))]
-#[allow(clippy::too_many_arguments)]
-pub fn yuva444p16_to_rgba_u16_row(
-    y: &[u16],
-    u: &[u16],
-    v: &[u16],
-    a: &[u16],
-    rgba_out: &mut [u16],
-    width: usize,
-    matrix: ColorMatrix,
-    full_range: bool,
-    use_simd: bool,
-) {
-    let rgba_min = rgba_row_elems(width);
-    assert!(y.len() >= width, "y row too short");
-    assert!(u.len() >= width, "u row too short");
-    assert!(v.len() >= width, "v row too short");
-    assert!(a.len() >= width, "a row too short");
-    assert!(rgba_out.len() >= rgba_min, "rgba_out row too short");
-
-    if use_simd {
-        cfg_select! {
-            target_arch = "aarch64" => {
-                if neon_available() {
-                    // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u, v, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
-                    return;
-                }
-            },
-            target_arch = "x86_64" => {
-                if avx512_available() {
-                    // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u, v, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
-                    return;
-                }
-                if avx2_available() {
-                    // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u, v, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
-                    return;
-                }
-                if sse41_available() {
-                    // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u, v, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
-                    return;
-                }
-            },
-            target_arch = "wasm32" => {
-                if simd128_available() {
-                    // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-                            y, u, v, a, rgba_out, width, matrix, full_range,
-                        );
-                    }
-                    return;
-                }
-            },
-            _ => {}
-        }
-    }
-
-    scalar::yuv_444p16_to_rgba_u16_with_alpha_src_row::<false>(
-        y, u, v, a, rgba_out, width, matrix, full_range,
+    yuva444p16_to_rgba_u16_row_endian(
+        y, u, v, a, rgba_out, width, matrix, full_range, use_simd, false,
     );
 }

From de86493530d470878321ccf47932d7bb20989935 Mon Sep 17 00:00:00 2001
From: uqio <276879906+uqio@users.noreply.github.com>
Date: Sat, 9 May 2026 14:54:36 +1200
Subject: [PATCH 8/8] fix(be-yuv-hb): normalize u16::from_le before validation
 in YUVA high-bit Frame constructors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Codex round-6 review on PR #89 flagged a high-severity finding:

- src/frame/yuva/sub_4_2_0.rs:665-715 — Yuva420pFrame16::try_new_checked
  range validators tested raw `u16` samples, so on a BE host a valid
  LE-encoded yuva420p10le sample 1023 (bytes [0xFF, 0x03]) reads back as
  host-native 0xFF03 and was rejected as out of range.

This is the same bug class fixed in b9a6c19 for the non-alpha planar /
Pn / Y2xx families; the YUVA family was missed in that sweep.

The findings were not limited to 4:2:0: every YUVA high-bit checked
constructor (4:2:0, 4:2:2, 4:4:4 — 9/10/12/14/16-bit aliases) had the
same raw-`u16` validation pattern across all four planes (Y, U, V,
**alpha**).

The frame types document the **LE-encoded byte layout** contract (the
`&[u16]` plane is the FFmpeg `*LE` byte buffer reinterpreted as `u16`).
Normalize each sample with `u16::from_le` before the range check so the
validator operates on the intended logical sample value on every host.
On LE hosts `from_le` is a no-op; on BE hosts it byte-swaps each `u16`
back into host-native form.

Mirrors the established `Yuv420pFrame16::try_new_checked` pattern
shipped in b9a6c19.
Affected validators (every checked-constructor sample-scan loop in the YUVA family is patched — 12 plane-loops total, 4 per family): - src/frame/yuva/sub_4_2_0.rs: Yuva420pFrame16 (Y, U, V, A planes) - src/frame/yuva/sub_4_2_2.rs: Yuva422pFrame16 (Y, U, V, A planes) - src/frame/yuva/sub_4_4_4.rs: Yuva444pFrame16 (Y, U, V, A planes) The reported `value` in `SampleOutOfRange` errors is the normalized logical sample so callers can match it against the declared `max_valid` regardless of host endianness. Tests: add 6 host-independent BE-host regression tests in a new src/frame/tests/yuva_high_bit.rs module, building each plane from LE-encoded bytes via `to_le_bytes` and reading back via `from_ne_bytes` (no `cfg(target_endian = "little")` gate). Each YUVA family gets one positive case (valid LE buffer that would be rejected without `from_le` on a BE host, with a non-trivial alpha plane) and one negative case where the bad sample is planted on the **alpha** plane, ensuring alpha-plane validation is independently exercised. Covers Yuva420p10, Yuva422p10, Yuva444p10. Audit confirmed there are no other in-scope sample-scan validators in the YUVA tree: only the three `try_new_checked` constructors above, and `src/frame/yuva/mod.rs` declares no additional validators. All three families exist (4:2:2 was not absent from the tree). Out-of- scope frame families (Xv36, Bayer16) remain to be addressed in follow-up BE-support PRs. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/frame/tests/mod.rs | 1 + src/frame/tests/yuva_high_bit.rs | 165 +++++++++++++++++++++++++++++++ src/frame/yuva/sub_4_2_0.rs | 36 +++++-- src/frame/yuva/sub_4_2_2.rs | 36 +++++-- src/frame/yuva/sub_4_4_4.rs | 36 +++++-- 5 files changed, 250 insertions(+), 24 deletions(-) create mode 100644 src/frame/tests/yuva_high_bit.rs diff --git a/src/frame/tests/mod.rs b/src/frame/tests/mod.rs index 05f7e180..eace4fca 100644 --- a/src/frame/tests/mod.rs +++ b/src/frame/tests/mod.rs @@ -24,6 +24,7 @@ mod v210; mod y2xx; mod ya16; mod ya8; +mod yuva_high_bit; // ---- 32-bit overflow regressions -------------------------------------- // diff --git a/src/frame/tests/yuva_high_bit.rs b/src/frame/tests/yuva_high_bit.rs new file mode 100644 index 00000000..f5d45ec5 --- /dev/null +++ b/src/frame/tests/yuva_high_bit.rs @@ -0,0 +1,165 @@ +use super::*; + +// ---- YUVA high-bit BE-host regression tests --------------------------- +// +// `Yuva420pFrame16` / `Yuva422pFrame16` / `Yuva444pFrame16` document a +// **LE-encoded byte-layout** contract on their `&[u16]` planes (the +// FFmpeg `*LE` byte buffer reinterpreted as `u16`). The +// `try_new_checked` validators must therefore normalize each sample +// with `u16::from_le` before comparing against `max_valid`; otherwise +// a valid LE-encoded plane on a BE host has every `u16` byte-swapped +// relative to the intended logical value and the validator falsely +// rejects every row. +// +// These tests build the planes explicitly from LE-encoded bytes via +// `to_le_bytes` and read back as `&[u16]` via `from_ne_bytes`. On an +// LE host the resulting `u16` values are identical to the intended +// literals; on a BE host every `u16` is byte-swapped relative to the +// intent, exercising the `u16::from_le` normalization inside the +// validators. Without that normalization the validators would falsely +// reject every valid LE-encoded plane on a BE host. 
+//
+// Each family covers (1) a positive case — a logical LE buffer of
+// valid samples (including the alpha plane) that must be accepted on
+// both LE and BE hosts — and (2) a negative case where a sample is
+// invalid even after `from_le` normalization, ensuring the validator
+// still surfaces real errors. Negative cases place the bad sample on
+// the alpha plane to give that plane its own dedicated coverage.
+
+/// Build a `Vec<u16>` representing the LE-encoded byte layout of
+/// `intended` (i.e., what FFmpeg would emit on the wire). On an LE
+/// host the result equals `intended` element-wise; on a BE host every
+/// element is byte-swapped relative to `intended`.
+fn le_encoded_u16_buf(intended: &[u16]) -> std::vec::Vec<u16> {
+    let bytes: std::vec::Vec<u8> = intended.iter().flat_map(|v| v.to_le_bytes()).collect();
+    bytes
+        .chunks_exact(2)
+        .map(|b| u16::from_ne_bytes([b[0], b[1]]))
+        .collect()
+}
+
+// ---- Yuva420p10 -------------------------------------------------------
+
+#[test]
+fn yuva420p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() {
+    // 10-bit-low-packed white = 1023 (LE bytes [0xFF, 0x03]). The alpha
+    // plane is full-width × full-height; chroma is half × half.
+    let intended_y = std::vec![1023u16; 16 * 8];
+    let intended_uv = std::vec![512u16; 8 * 4];
+    let intended_a = std::vec![1023u16; 16 * 8];
+    let y = le_encoded_u16_buf(&intended_y);
+    let u = le_encoded_u16_buf(&intended_uv);
+    let v = le_encoded_u16_buf(&intended_uv);
+    let a = le_encoded_u16_buf(&intended_a);
+    Yuva420p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 8, 8, 16)
+        .expect("LE-encoded valid yuva420p10le must be accepted on both LE and BE hosts");
+}
+
+#[test]
+fn yuva420p10_try_new_checked_rejects_le_encoded_alpha_out_of_range_on_any_host() {
+    // After `u16::from_le` normalization the offending alpha sample is
+    // 1024 (just above the 10-bit max of 1023).
On both LE and BE hosts + // the validator must catch this — the LE-encoded byte buffer carries + // the logical value 1024 in `a[3 * 16 + 5]`. + let intended_y = std::vec![0u16; 16 * 8]; + let intended_uv = std::vec![512u16; 8 * 4]; + let mut intended_a = std::vec![1023u16; 16 * 8]; + intended_a[3 * 16 + 5] = 1024; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + let a = le_encoded_u16_buf(&intended_a); + let e = Yuva420p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 8, 8, 16).unwrap_err(); + assert!(matches!( + e, + Yuva420pFrame16Error::SampleOutOfRange { + plane: Yuva420pFrame16Plane::A, + value: 1024, + max_valid: 1023, + .. + } + )); +} + +// ---- Yuva422p10 ------------------------------------------------------- + +#[test] +fn yuva422p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // 4:2:2 geometry: Y/A are full-width × full-height; U/V are + // half-width × full-height. 10-bit white = 1023 (LE bytes + // [0xFF, 0x03]). + let intended_y = std::vec![1023u16; 16 * 8]; + let intended_uv = std::vec![512u16; 8 * 8]; + let intended_a = std::vec![1023u16; 16 * 8]; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + let a = le_encoded_u16_buf(&intended_a); + Yuva422p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 8, 8, 16) + .expect("LE-encoded valid yuva422p10le must be accepted on both LE and BE hosts"); +} + +#[test] +fn yuva422p10_try_new_checked_rejects_le_encoded_alpha_out_of_range_on_any_host() { + // Plant an out-of-range logical alpha sample (1024 > 10-bit max + // 1023) in the LE byte buffer. The validator must surface the + // normalized logical value on both LE and BE hosts. 
+ let intended_y = std::vec![0u16; 16 * 8]; + let intended_uv = std::vec![512u16; 8 * 8]; + let mut intended_a = std::vec![1023u16; 16 * 8]; + intended_a[2 * 16 + 7] = 1024; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_uv); + let v = le_encoded_u16_buf(&intended_uv); + let a = le_encoded_u16_buf(&intended_a); + let e = Yuva422p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 8, 8, 16).unwrap_err(); + assert!(matches!( + e, + Yuva422pFrame16Error::SampleOutOfRange { + plane: Yuva422pFrame16Plane::A, + value: 1024, + max_valid: 1023, + .. + } + )); +} + +// ---- Yuva444p10 ------------------------------------------------------- + +#[test] +fn yuva444p10_try_new_checked_accepts_le_encoded_buffer_on_any_host() { + // 4:4:4 geometry: every plane (Y, U, V, A) is full-width × + // full-height. 10-bit white = 1023 (LE bytes [0xFF, 0x03]). + let intended_full = std::vec![1023u16; 16 * 8]; + let intended_chroma = std::vec![512u16; 16 * 8]; + let y = le_encoded_u16_buf(&intended_full); + let u = le_encoded_u16_buf(&intended_chroma); + let v = le_encoded_u16_buf(&intended_chroma); + let a = le_encoded_u16_buf(&intended_full); + Yuva444p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 16, 16, 16) + .expect("LE-encoded valid yuva444p10le must be accepted on both LE and BE hosts"); +} + +#[test] +fn yuva444p10_try_new_checked_rejects_le_encoded_alpha_out_of_range_on_any_host() { + // Plant an out-of-range logical alpha sample (1024 > 10-bit max + // 1023). The validator must catch this regardless of host endianness. 
+ let intended_y = std::vec![0u16; 16 * 8]; + let intended_chroma = std::vec![512u16; 16 * 8]; + let mut intended_a = std::vec![1023u16; 16 * 8]; + intended_a[4 * 16 + 9] = 1024; + let y = le_encoded_u16_buf(&intended_y); + let u = le_encoded_u16_buf(&intended_chroma); + let v = le_encoded_u16_buf(&intended_chroma); + let a = le_encoded_u16_buf(&intended_a); + let e = Yuva444p10Frame::try_new_checked(&y, &u, &v, &a, 16, 8, 16, 16, 16, 16).unwrap_err(); + assert!(matches!( + e, + Yuva444pFrame16Error::SampleOutOfRange { + plane: Yuva444pFrame16Plane::A, + value: 1024, + max_valid: 1023, + .. + } + )); +} diff --git a/src/frame/yuva/sub_4_2_0.rs b/src/frame/yuva/sub_4_2_0.rs index 5295a310..74bc8adc 100644 --- a/src/frame/yuva/sub_4_2_0.rs +++ b/src/frame/yuva/sub_4_2_0.rs @@ -638,6 +638,20 @@ impl<'a, const BITS: u32> Yuva420pFrame16<'a, BITS> { /// /// Cost: one O(plane_size) linear scan per plane (Y, U, V, A — /// four planes total). + /// + /// Per the LE-encoded byte contract documented on the type, samples + /// are validated **after** `u16::from_le` normalization so the range + /// check operates on the intended logical sample value on every host. + /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form before the comparison. Without + /// this normalization a valid `yuva420p10le` plane on a BE host would + /// have its samples appear byte-swapped (e.g. `1023` encoded LE as + /// bytes `[0xFF, 0x03]` reads as host-native `0xFF03` on BE) and the + /// validator would falsely reject every row. The reported `value` in + /// the error is the normalized logical sample so callers can match it + /// against the declared `max_valid`. Mirrors the + /// `Yuv420pFrame16::try_new_checked` pattern. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn try_new_checked( @@ -663,11 +677,14 @@ impl<'a, const BITS: u32> Yuva420pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + // Normalize from LE-encoded wire to host-native before the + // range check (no-op on LE host, byte-swap on BE host). + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva420pFrame16Error::SampleOutOfRange { plane: Yuva420pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -676,11 +693,12 @@ impl<'a, const BITS: u32> Yuva420pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * u_stride as usize; for (col, &s) in u[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva420pFrame16Error::SampleOutOfRange { plane: Yuva420pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -689,11 +707,12 @@ impl<'a, const BITS: u32> Yuva420pFrame16<'a, BITS> { for row in 0..chroma_h { let start = row * v_stride as usize; for (col, &s) in v[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva420pFrame16Error::SampleOutOfRange { plane: Yuva420pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -702,11 +721,12 @@ impl<'a, const BITS: u32> Yuva420pFrame16<'a, BITS> { for row in 0..h { let start = row * a_stride as usize; for (col, &s) in a[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva420pFrame16Error::SampleOutOfRange { plane: Yuva420pFrame16Plane::A, index: start + col, - value: s, + value: logical, max_valid, }); } diff --git a/src/frame/yuva/sub_4_2_2.rs 
b/src/frame/yuva/sub_4_2_2.rs index 1d618423..1a5032c1 100644 --- a/src/frame/yuva/sub_4_2_2.rs +++ b/src/frame/yuva/sub_4_2_2.rs @@ -649,6 +649,20 @@ impl<'a, const BITS: u32> Yuva422pFrame16<'a, BITS> { /// four planes total). The default [`Self::try_new`] skips this so /// the hot path (decoder output, already-conforming buffers) stays /// O(1). + /// + /// Per the LE-encoded byte contract documented on the type, samples + /// are validated **after** `u16::from_le` normalization so the range + /// check operates on the intended logical sample value on every host. + /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form before the comparison. Without + /// this normalization a valid `yuva422p10le` plane on a BE host would + /// have its samples appear byte-swapped (e.g. `1023` encoded LE as + /// bytes `[0xFF, 0x03]` reads as host-native `0xFF03` on BE) and the + /// validator would falsely reject every row. The reported `value` in + /// the error is the normalized logical sample so callers can match it + /// against the declared `max_valid`. Mirrors the + /// `Yuv422pFrame16::try_new_checked` pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn try_new_checked( @@ -673,11 +687,14 @@ impl<'a, const BITS: u32> Yuva422pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + // Normalize from LE-encoded wire to host-native before the + // range check (no-op on LE host, byte-swap on BE host). 
+ let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva422pFrame16Error::SampleOutOfRange { plane: Yuva422pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -686,11 +703,12 @@ impl<'a, const BITS: u32> Yuva422pFrame16<'a, BITS> { for row in 0..h { let start = row * u_stride as usize; for (col, &s) in u[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva422pFrame16Error::SampleOutOfRange { plane: Yuva422pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -699,11 +717,12 @@ impl<'a, const BITS: u32> Yuva422pFrame16<'a, BITS> { for row in 0..h { let start = row * v_stride as usize; for (col, &s) in v[start..start + chroma_w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva422pFrame16Error::SampleOutOfRange { plane: Yuva422pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -712,11 +731,12 @@ impl<'a, const BITS: u32> Yuva422pFrame16<'a, BITS> { for row in 0..h { let start = row * a_stride as usize; for (col, &s) in a[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva422pFrame16Error::SampleOutOfRange { plane: Yuva422pFrame16Plane::A, index: start + col, - value: s, + value: logical, max_valid, }); } diff --git a/src/frame/yuva/sub_4_4_4.rs b/src/frame/yuva/sub_4_4_4.rs index 5aa1fbc4..4ef8c287 100644 --- a/src/frame/yuva/sub_4_4_4.rs +++ b/src/frame/yuva/sub_4_4_4.rs @@ -308,6 +308,20 @@ impl<'a, const BITS: u32> Yuva444pFrame16<'a, BITS> { /// /// Cost: one O(plane_size) linear scan per plane (Y, U, V, A — four /// planes total). 
+ /// + /// Per the LE-encoded byte contract documented on the type, samples + /// are validated **after** `u16::from_le` normalization so the range + /// check operates on the intended logical sample value on every host. + /// On little-endian hosts `from_le` is a no-op (the host-native `u16` + /// already matches the wire); on big-endian hosts it byte-swaps each + /// `u16` back into host-native form before the comparison. Without + /// this normalization a valid `yuva444p10le` plane on a BE host would + /// have its samples appear byte-swapped (e.g. `1023` encoded LE as + /// bytes `[0xFF, 0x03]` reads as host-native `0xFF03` on BE) and the + /// validator would falsely reject every row. The reported `value` in + /// the error is the normalized logical sample so callers can match it + /// against the declared `max_valid`. Mirrors the + /// `Yuv444pFrame16::try_new_checked` pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn try_new_checked( @@ -331,11 +345,14 @@ impl<'a, const BITS: u32> Yuva444pFrame16<'a, BITS> { for row in 0..h { let start = row * y_stride as usize; for (col, &s) in y[start..start + w].iter().enumerate() { - if s > max_valid { + // Normalize from LE-encoded wire to host-native before the + // range check (no-op on LE host, byte-swap on BE host). 
+ let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva444pFrame16Error::SampleOutOfRange { plane: Yuva444pFrame16Plane::Y, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -344,11 +361,12 @@ impl<'a, const BITS: u32> Yuva444pFrame16<'a, BITS> { for row in 0..h { let start = row * u_stride as usize; for (col, &s) in u[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva444pFrame16Error::SampleOutOfRange { plane: Yuva444pFrame16Plane::U, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -357,11 +375,12 @@ impl<'a, const BITS: u32> Yuva444pFrame16<'a, BITS> { for row in 0..h { let start = row * v_stride as usize; for (col, &s) in v[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva444pFrame16Error::SampleOutOfRange { plane: Yuva444pFrame16Plane::V, index: start + col, - value: s, + value: logical, max_valid, }); } @@ -370,11 +389,12 @@ impl<'a, const BITS: u32> Yuva444pFrame16<'a, BITS> { for row in 0..h { let start = row * a_stride as usize; for (col, &s) in a[start..start + w].iter().enumerate() { - if s > max_valid { + let logical = u16::from_le(s); + if logical > max_valid { return Err(Yuva444pFrame16Error::SampleOutOfRange { plane: Yuva444pFrame16Plane::A, index: start + col, - value: s, + value: logical, max_valid, }); }