diff --git a/benches/ayuv64_a_plus_combo.rs b/benches/ayuv64_a_plus_combo.rs index a2f2c307..68d27faa 100644 --- a/benches/ayuv64_a_plus_combo.rs +++ b/benches/ayuv64_a_plus_combo.rs @@ -116,6 +116,7 @@ fn bench_u8(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); ayuv64_to_rgba_row( black_box(&packed[p_off..p_off + row_elems]), @@ -124,6 +125,7 @@ fn bench_u8(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); } black_box((&rgb, &rgba)); @@ -201,6 +203,7 @@ fn bench_u16(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); ayuv64_to_rgba_u16_row( black_box(&packed[p_off..p_off + row_elems]), @@ -209,6 +212,7 @@ fn bench_u16(c: &mut Criterion) { MATRIX, FULL_RANGE, use_simd, + false, ); } black_box((&rgb, &rgba)); diff --git a/src/row/arch/neon/ayuv64.rs b/src/row/arch/neon/ayuv64.rs index e7fe910b..6ee9791c 100644 --- a/src/row/arch/neon/ayuv64.rs +++ b/src/row/arch/neon/ayuv64.rs @@ -14,6 +14,10 @@ //! producing `uint16x8_t` halves for each of the four channels: //! `a_lo/a_hi`, `y_lo/y_hi`, `u_lo/u_hi`, `v_lo/v_hi`. //! +//! For BE wire format (`BE = true`), each deinterleaved `uint16x8_t` +//! channel is byte-swapped via `bswap_u16x8_if_be::` after the +//! `vld4q_u16` call. +//! //! - u8 output: Y values are full 16-bit (0..65535), so //! `scale_y_u16_to_i16` is used (not `scale_y`, which would corrupt //! values > 32767). i32 chroma via `chroma_i16x8`. @@ -26,7 +30,7 @@ //! ## Tail //! //! `width % 16` remaining pixels fall through to the scalar -//! `ayuv64_to_rgb_or_rgba_row::` (or u16 version). +//! `ayuv64_to_rgb_or_rgba_row::` (or u16 version). use core::arch::aarch64::*; @@ -37,13 +41,13 @@ use crate::{ColorMatrix, row::scalar}; /// NEON AYUV64 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::ayuv64_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::ayuv64_to_rgb_or_rgba_row::`. /// /// Valid monomorphizations: -/// - `` — RGB (α dropped) -/// - `` — RGBA, source α depth-converted u16 → u8 (`>> 8`) +/// - `` — RGB (α dropped) +/// - `` — RGBA, source α depth-converted u16 → u8 (`>> 8`) /// -/// `` is rejected at monomorphization via `const { assert! }`. +/// `` is rejected at monomorphization via `const { assert! }`. /// /// # Safety /// @@ -52,7 +56,11 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -91,16 +99,16 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(q_lo.0); + let y_lo_u16 = bswap_u16x8_if_be::(q_lo.1); + let u_lo_u16 = bswap_u16x8_if_be::(q_lo.2); + let v_lo_u16 = bswap_u16x8_if_be::(q_lo.3); - let a_hi_u16 = q_hi.0; // uint16x8_t — A for pixels 8..15 - let y_hi_u16 = q_hi.1; // uint16x8_t — Y for pixels 8..15 - let u_hi_u16 = q_hi.2; // uint16x8_t — U for pixels 8..15 - let v_hi_u16 = q_hi.3; // uint16x8_t — V for pixels 8..15 + let a_hi_u16 = bswap_u16x8_if_be::(q_hi.0); + let y_hi_u16 = bswap_u16x8_if_be::(q_hi.1); + let u_hi_u16 = bswap_u16x8_if_be::(q_hi.2); + let v_hi_u16 = bswap_u16x8_if_be::(q_hi.3); // Reinterpret chroma as signed i16 (bias subtraction fits i16: // chroma ∈ [0,65535], bias=32768, so (chroma-bias) ∈ [-32768,32767]). 
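// A worked illustration of the module doc's `scale_y_u16_to_i16` vs `scale_y`
// point (example values are illustrative, not from the source): unlike the
// bias-subtracted chroma above, a raw full-range Y sample does not fit in i16.
//
//     let y: u16 = 0xFF00;            // 65280 — a bright full-range luma sample
//     assert_eq!(y as i16, -256i16);  // naive u16→i16 reinterpret flips the sign,
//                                     // which is why the u8 path must widen instead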
@@ -194,7 +202,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -210,13 +218,13 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row`. +/// Byte-identical to `scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::`. /// /// Valid monomorphizations: -/// - `` — RGB u16 (α dropped) -/// - `` — RGBA u16, source α written direct (no conversion) +/// - `` — RGB u16 (α dropped) +/// - `` — RGBA u16, source α written direct (no conversion) /// -/// `` is rejected at monomorphization via `const { assert! }`. +/// `` is rejected at monomorphization via `const { assert! }`. /// /// # Safety /// @@ -225,7 +233,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -265,15 +277,16 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(q_lo.0); + let y_lo_u16 = bswap_u16x8_if_be::(q_lo.1); + let u_lo_u16 = bswap_u16x8_if_be::(q_lo.2); + let v_lo_u16 = bswap_u16x8_if_be::(q_lo.3); - let a_hi_u16 = q_hi.0; - let y_hi_u16 = q_hi.1; - let u_hi_u16 = q_hi.2; - let v_hi_u16 = q_hi.3; + let a_hi_u16 = bswap_u16x8_if_be::(q_hi.0); + let y_hi_u16 = bswap_u16x8_if_be::(q_hi.1); + let u_hi_u16 = bswap_u16x8_if_be::(q_hi.2); + let v_hi_u16 = bswap_u16x8_if_be::(q_hi.3); // Chroma: widen u16 → i32, subtract bias, apply c_scale (Q15). // 4:4:4 — 8 per-pixel chroma values per half, split into 2 × i32x4. @@ -411,7 +424,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -427,7 +440,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -435,7 +448,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -443,7 +456,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -451,14 +464,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// NEON AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -466,7 +479,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -474,7 +489,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). 
#[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -482,7 +497,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -491,7 +508,7 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// NEON AYUV64 → u8 luma. Y is the second u16 (slot 1) of each pixel /// quadruple; `vshrn_n_u16::<8>` narrows u16 → u8 (high byte = `>> 8`). /// -/// Byte-identical to `scalar::ayuv64_to_luma_row`. +/// Byte-identical to `scalar::ayuv64_to_luma_row::`. /// /// # Safety /// @@ -500,7 +517,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -510,16 +531,19 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Two vld4q_u16 loads: channel 1 (.1) = Y for each group of 8 pixels. let q_lo = vld4q_u16(packed.as_ptr().add(x * 4)); let q_hi = vld4q_u16(packed.as_ptr().add(x * 4 + 32)); + // Apply BE byte-swap to Y channel if needed. + let y_lo = bswap_u16x8_if_be::(q_lo.1); + let y_hi = bswap_u16x8_if_be::(q_hi.1); // vshrn_n_u16::<8>: narrows 8 u16 → 8 u8 by taking high byte (>> 8). - let y_lo_u8 = vshrn_n_u16::<8>(q_lo.1); - let y_hi_u8 = vshrn_n_u16::<8>(q_hi.1); + let y_lo_u8 = vshrn_n_u16::<8>(y_lo); + let y_hi_u8 = vshrn_n_u16::<8>(y_hi); vst1_u8(luma_out.as_mut_ptr().add(x), y_lo_u8); vst1_u8(luma_out.as_mut_ptr().add(x + 8), y_hi_u8); x += 16; } // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -533,7 +557,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// NEON AYUV64 → u16 luma. Direct copy of Y samples (slot 1, no shift — /// 16-bit native). /// -/// Byte-identical to `scalar::ayuv64_to_luma_u16_row`. +/// Byte-identical to `scalar::ayuv64_to_luma_u16_row::`. /// /// # Safety /// @@ -542,7 +566,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -552,14 +580,16 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Two vld4q_u16 loads: channel 1 (.1) = Y. let q_lo = vld4q_u16(packed.as_ptr().add(x * 4)); let q_hi = vld4q_u16(packed.as_ptr().add(x * 4 + 32)); - // Direct copy — Y samples are 16-bit native (no shift needed). - vst1q_u16(luma_out.as_mut_ptr().add(x), q_lo.1); - vst1q_u16(luma_out.as_mut_ptr().add(x + 8), q_hi.1); + // Apply BE byte-swap to Y channel if needed, then direct copy. 
+ let y_lo = bswap_u16x8_if_be::(q_lo.1); + let y_hi = bswap_u16x8_if_be::(q_hi.1); + vst1q_u16(luma_out.as_mut_ptr().add(x), y_lo); + vst1q_u16(luma_out.as_mut_ptr().add(x + 8), y_hi); x += 16; } // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/neon/mod.rs b/src/row/arch/neon/mod.rs index 86b8d00a..91144808 100644 --- a/src/row/arch/neon/mod.rs +++ b/src/row/arch/neon/mod.rs @@ -251,5 +251,65 @@ pub(super) fn scale_y_u16_i64( } } +// ---- BE helpers ---------------------------------------------------------- + +/// Compile-time host endianness. `true` on BE targets (e.g. `s390x`, +/// `powerpc`-BE), `false` on LE targets (e.g. `aarch64-apple-darwin`, +/// `x86_64`). +/// +/// Used by the conditional byte-swap helpers below to decide whether a raw +/// NEON load already matches the wire endian. Without this, the helpers +/// would only correctly handle two of the four `host × wire` quadrants. +const HOST_NATIVE_BE: bool = cfg!(target_endian = "big"); + +/// Conditionally byte-swap 8 u16 lanes in a NEON register so that the +/// returned value is in **host-native** byte order, regardless of the +/// host endianness. +/// +/// The gate is `BE != HOST_NATIVE_BE`: +/// +/// | wire `BE` | host | gate | action | +/// |-----------|------------|---------|-------------------| +/// | `false` | LE | `false` | no swap (LE→LE) | +/// | `false` | BE | `true` | swap (LE→BE) | +/// | `true` | LE | `true` | swap (BE→LE) | +/// | `true` | BE | `false` | no swap (BE→BE) | +/// +/// The unused branch is eliminated by the compiler — `BE` and +/// `HOST_NATIVE_BE` are both compile-time constants, so the gate folds. +/// +/// Used by the packed YUV 4:4:4 kernels (XV36, AYUV64) after `vld4q_u16` +/// to correct samples loaded from a wire-encoded buffer. +/// +/// Mirrors PR #82's `9c7d533` dispatcher routing fix and PR #85's +/// `9e678b0` Ya16 SIMD gate — both addressed the same bug class +/// (only swapping on `BE = true` rather than `BE != HOST_NATIVE_BE`). +#[inline(always)] +pub(super) unsafe fn bswap_u16x8_if_be(v: uint16x8_t) -> uint16x8_t { + if BE != HOST_NATIVE_BE { + unsafe { vreinterpretq_u16_u8(vrev16q_u8(vreinterpretq_u8_u16(v))) } + } else { + v + } +} + +/// Conditionally byte-swap 4 u32 lanes in a NEON register so that the +/// returned value is in **host-native** byte order, regardless of the +/// host endianness. +/// +/// Same `BE != HOST_NATIVE_BE` gate as [`bswap_u16x8_if_be`] — see that +/// helper for the truth table. +/// +/// Used by the V410 kernel after `vld1q_u32` to correct u32 words loaded +/// from a wire-encoded buffer. 
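// A minimal scalar sketch of the same `BE != HOST_NATIVE_BE` gate (hypothetical
// helper, not part of this patch) — both operands are compile-time constants,
// so one arm folds away per monomorphization:
//
//     #[inline(always)]
//     fn swap_u16_if_wire_be<const BE: bool>(v: u16) -> u16 {
//         if BE != cfg!(target_endian = "big") {
//             v.swap_bytes() // wire order differs from host order
//         } else {
//             v              // load already matched host order
//         }
//     }
//
//     // On an LE host, the BE wire bytes [0x12, 0x34] load natively as 0x3412;
//     // swap_u16_if_wire_be::<true>(0x3412) == 0x1234.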
+#[inline(always)] +pub(super) unsafe fn bswap_u32x4_if_be(v: uint32x4_t) -> uint32x4_t { + if BE != HOST_NATIVE_BE { + unsafe { vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(v))) } + } else { + v + } +} + #[cfg(all(test, feature = "std"))] mod tests; diff --git a/src/row/arch/neon/tests/ayuv64.rs b/src/row/arch/neon/tests/ayuv64.rs index 84c02944..6f5938fe 100644 --- a/src/row/arch/neon/tests/ayuv64.rs +++ b/src/row/arch/neon/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON ayuv64→luma u16 diverges (width={width})"); } @@ -154,7 +158,7 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16).map(|n| n as u16).collect(); assert_eq!( @@ -167,7 +171,7 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { // produces a well-defined Y output. Matrix doesn't matter for neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -183,3 +187,169 @@ fn neon_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u16x8_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer for AYUV64. +/// +/// Covers all four valid `(ALPHA, ALPHA_SRC)` quadrant subsets used by the +/// public API: (false,false) and (true,true). Source-α paths route the α +/// channel directly through the SIMD endian gate, so this also covers the +/// source-α-specific code path. 
+/// +/// On an LE host: +/// - SIMD `<…BE=false>` on LE input → no-swap path (gate doesn't fire). +/// - SIMD `<…BE=true>` on BE input → swap path (gate fires). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_ayuv64_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + // u8 RGB (ALPHA=false, ALPHA_SRC=false) + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w}) — endian gate broken" + ); + } + + // u8 RGBA + source α (ALPHA=true, ALPHA_SRC=true) — exercises the + // source-α path through the endian gate. + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w}) — endian gate broken" + ); + } + + // u16 RGB + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w}) — endian gate broken" + ); + } + + // u16 RGBA + source α + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon ayuv64 BE-vs-LE SIMD parity 
failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/tests/v410.rs b/src/row/arch/neon/tests/v410.rs index 1da5f45d..1d5fbb1c 100644 --- a/src/row/arch/neon/tests/v410.rs +++ b/src/row/arch/neon/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON v410→luma u16 diverges (width={width})"); } @@ -158,7 +158,7 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "neon v410 luma reorder bug"); @@ -167,9 +167,15 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -181,3 +187,129 @@ fn neon_v410_lane_order_per_pixel_y_and_u() { "neon v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u32x4_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer, which existing tests miss +/// (per-backend tests use `BE=false`; dispatcher BE-vs-LE comparisons use +/// `use_simd=false`). +/// +/// On an LE host: +/// - SIMD `` on LE input → gate doesn't fire → exercises no-swap path. +/// - SIMD `` on BE input → gate fires → exercises swap path. 
+/// +/// On a BE host (s390x QEMU when Phase 3 lands), the same test exercises the +/// opposite quadrant. +/// +/// Widths chosen to cover the SIMD main loop (>=8 multiples) + scalar tail +/// (e.g. 17 = 16-lane main + 1 tail; 33 = 32-lane main + 1 tail). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_v410_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + // u8 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // u16 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon v410 BE-vs-LE SIMD parity failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/tests/xv36.rs b/src/row/arch/neon/tests/xv36.rs index b254da48..b1e216bc 100644 --- a/src/row/arch/neon/tests/xv36.rs +++ b/src/row/arch/neon/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut 
k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "NEON xv36→luma u16 diverges (width={width})"); } @@ -154,7 +154,7 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "neon xv36 luma_u16 reorder bug"); @@ -163,9 +163,15 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -177,3 +183,124 @@ fn neon_xv36_lane_order_per_pixel_y_and_u() { "neon xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test: probes the `bswap_u16x8_if_be` gate +/// added in `b7fb9d3` (PR #86) at the SIMD layer. Existing per-backend tests +/// use `BE=false`; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +/// +/// Builds an LE pseudo-random buffer, byte-swaps every u16 to obtain the +/// equivalent BE-encoded buffer, then asserts that: +/// SIMD(LE) == SIMD(BE) +/// for every output variant (u8 RGB/RGBA, u16 RGB/RGBA, luma u8/u16). +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_xv36_be_le_simd_parity() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
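// Concretely (worked example, not part of the test): for the sample 0x1234,
//     u16::to_le_bytes(0x1234) == [0x34, 0x12]   // LE wire bytes
//     u16::to_be_bytes(0x1234) == [0x12, 0x34]   // BE wire bytes
// and `from_ne_bytes` then reinterprets those bytes in host order — exactly the
// bytes the SIMD loads will see in memory, on either kind of host.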
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + // u8 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // u16 RGB / RGBA + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w}) — endian gate broken" + ); + } + + // luma u8 + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (luma u8, w={w}) — endian gate broken" + ); + } + + // luma u16 + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "neon xv36 BE-vs-LE SIMD parity failed (luma u16, w={w}) — endian gate broken" + ); + } + } +} diff --git a/src/row/arch/neon/v410.rs b/src/row/arch/neon/v410.rs index ec06d822..9d266750 100644 --- a/src/row/arch/neon/v410.rs +++ b/src/row/arch/neon/v410.rs @@ -16,6 +16,12 @@ //! for `chroma_i16x8` / `scale_y`. Only the low 4 lanes carry valid //! data; the high 4 are don't-care. //! +//! ## BE support (``) +//! +//! When `BE = true`, each loaded `uint32x4_t` is byte-swapped via +//! `bswap_u32x4_if_be::` before field extraction. The scalar tail +//! also forwards `BE`. +//! //! ## Tail //! //! `width % 4` remaining pixels fall through to `scalar::v410_*`. @@ -29,7 +35,7 @@ use crate::{ColorMatrix, row::scalar}; /// NEON V410 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::v410_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::v410_to_rgb_or_rgba_row::`. 
/// /// # Safety /// @@ -38,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -71,8 +77,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 4 <= width { - // Load 4 V410 words. - let words = vld1q_u32(packed.as_ptr().add(x)); + // Load 4 V410 words; byte-swap each u32 for BE wire format. + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20). let u_u32 = vandq_u32(words, mask); @@ -140,7 +146,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -150,7 +162,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// NEON V410 → packed native-depth u16 RGB or RGBA (low-bit-packed at /// 10-bit). /// -/// Byte-identical to `scalar::v410_to_rgb_u16_or_rgba_u16_row::`. +/// Byte-identical to `scalar::v410_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -159,7 +171,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -196,7 +208,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); let u_u32 = vandq_u32(words, mask); let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); @@ -253,7 +265,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -268,7 +280,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// NEON V410 → u8 luma. Y is `(word >> 10) & 0x3FF`, then `>> 2`. /// -/// Byte-identical to `scalar::v410_to_luma_row`. +/// Byte-identical to `scalar::v410_to_luma_row::`. /// /// # Safety /// @@ -277,7 +289,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -285,7 +301,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mask = vdupq_n_u32(0x3FF); let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); // Y field: bits 19:10 → shift right 10, mask to 10-bit. let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); // Narrow u32→u16, then >> 2, then narrow u16→u8. 
@@ -301,7 +317,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi x += 4; } if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -310,7 +326,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// NEON V410 → u16 luma (low-bit-packed at 10-bit). /// -/// Byte-identical to `scalar::v410_to_luma_u16_row`. +/// Byte-identical to `scalar::v410_to_luma_u16_row::`. /// /// # Safety /// @@ -319,7 +335,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -327,7 +347,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mask = vdupq_n_u32(0x3FF); let mut x = 0usize; while x + 4 <= width { - let words = vld1q_u32(packed.as_ptr().add(x)); + let words = bswap_u32x4_if_be::(vld1q_u32(packed.as_ptr().add(x))); let y_u32 = vandq_u32(vshrq_n_u32::<10>(words), mask); // Narrow u32→u16 (values ≤ 1023, no saturation needed). let y_u16 = vmovn_u32(y_u32); @@ -338,7 +358,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width x += 4; } if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/neon/xv36.rs b/src/row/arch/neon/xv36.rs index 7ab03379..6e7b31ed 100644 --- a/src/row/arch/neon/xv36.rs +++ b/src/row/arch/neon/xv36.rs @@ -18,6 +18,10 @@ //! fit in i16, so `scale_y` is used (not `scale_y_u16_to_i16`). //! The Q15 pipeline uses i32 chroma (`chroma_i16x8`) at BITS=12. //! +//! For BE wire format (`BE = true`), each deinterleaved `uint16x8_t` +//! channel is byte-swapped via `bswap_u16x8_if_be::` after the +//! `vld4q_u16` call. +//! //! ## Tail //! //! `width % 8` remaining pixels fall through to `scalar::xv36_*`. @@ -31,7 +35,7 @@ use crate::{ColorMatrix, row::scalar}; /// NEON XV36 → packed u8 RGB or RGBA. /// -/// Byte-identical to `scalar::xv36_to_rgb_or_rgba_row::`. +/// Byte-identical to `scalar::xv36_to_rgb_or_rgba_row::`. /// /// # Safety /// @@ -40,7 +44,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -74,11 +78,16 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( // Load 8 XV36 quadruples (8 × 4 × u16 = 64 bytes). // vld4q_u16 deinterleaves: .0=U8, .1=Y8, .2=V8, .3=A8 (padding). let q = vld4q_u16(packed.as_ptr().add(x * 4)); + // Apply BE byte-swap per-channel if needed. + let u_raw = bswap_u16x8_if_be::(q.0); + let y_raw = bswap_u16x8_if_be::(q.1); + let v_raw = bswap_u16x8_if_be::(q.2); + // q.3 (A) is padding — discarded (no swap needed). + // Right-shift by 4 to drop the 4 padding LSBs → 12-bit range [0, 4095]. 
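// Worked example (illustrative values): an XV36 sample holding the 12-bit value
// 0xABC is stored MSB-aligned as 0xABC0; `>> 4` recovers 0x0ABC for the 12-bit
// paths, and the u8 luma path's `>> 8` yields 0xAB.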
- let u_u16 = vshrq_n_u16::<4>(q.0); // 8 lanes of U - let y_u16 = vshrq_n_u16::<4>(q.1); // 8 lanes of Y - let v_u16 = vshrq_n_u16::<4>(q.2); // 8 lanes of V - // q.3 (A) is padding — discarded. + let u_u16 = vshrq_n_u16::<4>(u_raw); // 8 lanes of U + let y_u16 = vshrq_n_u16::<4>(y_raw); // 8 lanes of Y + let v_u16 = vshrq_n_u16::<4>(v_raw); // 8 lanes of V // Reinterpret as signed i16 (values ≤ 4095 < 32767, safe). let u_i16 = vreinterpretq_s16_u16(u_u16); @@ -133,7 +142,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -143,7 +158,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// NEON XV36 → packed native-depth u16 RGB or RGBA (low-bit-packed at /// 12-bit). /// -/// Byte-identical to `scalar::xv36_to_rgb_u16_or_rgba_u16_row::`. +/// Byte-identical to `scalar::xv36_to_rgb_u16_or_rgba_u16_row::`. /// /// # Safety /// @@ -152,7 +167,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -190,11 +205,15 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - let u_u16 = vshrq_n_u16::<4>(q.0); - let y_u16 = vshrq_n_u16::<4>(q.1); - let v_u16 = vshrq_n_u16::<4>(q.2); + let u_raw = bswap_u16x8_if_be::(q.0); + let y_raw = bswap_u16x8_if_be::(q.1); + let v_raw = bswap_u16x8_if_be::(q.2); // q.3 (A) is padding — discarded. + let u_u16 = vshrq_n_u16::<4>(u_raw); + let y_u16 = vshrq_n_u16::<4>(y_raw); + let v_u16 = vshrq_n_u16::<4>(v_raw); + let u_i16 = vreinterpretq_s16_u16(u_u16); let y_i16 = vreinterpretq_s16_u16(y_u16); let v_i16 = vreinterpretq_s16_u16(v_u16); @@ -239,7 +258,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -255,7 +274,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// NEON XV36 → u8 luma. Y is quadruple element 1; `>> 8` brings the /// 12-bit MSB-aligned sample to 8-bit (drops 4 padding LSBs + 4 more). /// -/// Byte-identical to `scalar::xv36_to_luma_row`. +/// Byte-identical to `scalar::xv36_to_luma_row::`. /// /// # Safety /// @@ -264,7 +283,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -272,15 +295,17 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - // Y is q.1. Scalar does `packed[x*4+1] >> 8`; apply the same shift. + // Y is q.1. 
Apply BE byte-swap if needed before the shift. + let y_raw = bswap_u16x8_if_be::(q.1); + // Scalar does `packed[x*4+1] >> 8`; apply the same shift. // vshrn_n_u16::<8> narrows (u16 >> 8) → u8x8, handling 8 lanes. - let y_u8 = vshrn_n_u16::<8>(q.1); + let y_u8 = vshrn_n_u16::<8>(y_raw); vst1_u8(out.as_mut_ptr().add(x), y_u8); x += 8; } // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -291,7 +316,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// element 1; `>> 4` drops the 4 padding LSBs to give a 12-bit value /// in `[0, 4095]`. /// -/// Byte-identical to `scalar::xv36_to_luma_u16_row`. +/// Byte-identical to `scalar::xv36_to_luma_u16_row::`. /// /// # Safety /// @@ -300,7 +325,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -308,14 +337,15 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { let q = vld4q_u16(packed.as_ptr().add(x * 4)); - // Y is q.1. Scalar does `packed[x*4+1] >> 4`. - let y_u16 = vshrq_n_u16::<4>(q.1); + // Y is q.1. Apply BE byte-swap if needed, then `>> 4`. + let y_raw = bswap_u16x8_if_be::(q.1); + let y_u16 = vshrq_n_u16::<4>(y_raw); vst1q_u16(out.as_mut_ptr().add(x), y_u16); x += 8; } // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/wasm_simd128/ayuv64.rs b/src/row/arch/wasm_simd128/ayuv64.rs index 04465dfc..e10f01c1 100644 --- a/src/row/arch/wasm_simd128/ayuv64.rs +++ b/src/row/arch/wasm_simd128/ayuv64.rs @@ -44,7 +44,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -67,13 +67,14 @@ use crate::{ColorMatrix, row::scalar}; /// `simd128` must be enabled at compile time. #[inline] #[target_feature(enable = "simd128")] -unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { +unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { unsafe { // Load 4 × v128, each covering 2 pixels (8 × u16 = 16 bytes). - let raw0 = v128_load(ptr.cast()); // [A0,Y0,U0,V0, A1,Y1,U1,V1] - let raw1 = v128_load(ptr.add(8).cast()); // [A2,Y2,U2,V2, A3,Y3,U3,V3] - let raw2 = v128_load(ptr.add(16).cast()); // [A4,Y4,U4,V4, A5,Y5,U5,V5] - let raw3 = v128_load(ptr.add(24).cast()); // [A6,Y6,U6,V6, A7,Y7,U7,V7] + // For BE wire format, `load_endian_u16x8` byte-swaps each u16 lane. 
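// `endian::load_endian_u16x8` is defined elsewhere in the crate (not shown in
// this diff); one plausible shape, sketched here only to make the call sites
// readable (wasm32 is always little-endian, so BE wire data always needs the
// byte swap):
//
//     #[inline]
//     #[target_feature(enable = "simd128")]
//     unsafe fn load_endian_u16x8<const BE: bool>(p: *const u8) -> v128 {
//         let raw = unsafe { v128_load(p.cast()) };
//         if BE {
//             // swap the two bytes of every u16 lane
//             v128_or(u16x8_shl(raw, 8), u16x8_shr(raw, 8))
//         } else {
//             raw
//         }
//     }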
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // [A0,Y0,U0,V0, A1,Y1,U1,V1] + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // [A2,Y2,U2,V2, A3,Y3,U3,V3] + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // [A4,Y4,U4,V4, A5,Y5,U5,V5] + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // [A6,Y6,U6,V6, A7,Y7,U7,V7] // Per-channel byte positions within a 2-pixel v128 (16 bytes): // A → bytes 0,1 (pixel n) and 8,9 (pixel n+1) @@ -148,7 +149,11 @@ unsafe fn deinterleave_ayuv64_8px(ptr: *const u16) -> (v128, v128, v128, v128) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -184,7 +189,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_lo_i16 = i16x8_sub(u_lo_u16, bias16_v); @@ -216,7 +221,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 32)); let u_hi_i16 = i16x8_sub(u_hi_u16, bias16_v); let v_hi_i16 = i16x8_sub(v_hi_u16, bias16_v); @@ -278,7 +283,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -300,7 +305,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * 3`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_row( +pub(crate) unsafe fn ayuv64_to_rgb_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -308,7 +313,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -322,7 +327,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// 3. `out.len() >= width * 4`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -330,7 +335,7 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } @@ -355,7 +360,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -393,7 +402,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = i16x8_sub(u_u16, bias16); @@ -483,7 +492,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -505,7 +514,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row= width * 3` (u16 elements). 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -513,7 +522,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -527,7 +538,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// 3. `out.len() >= width * 4` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -535,7 +546,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -555,7 +568,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -563,8 +580,9 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = + deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4 + 32)); // >> 8 to get u8 luma (high byte of each Y u16 sample). // Logical shift (u16x8_shr) — arithmetic shift (i16x8_shr) would @@ -581,7 +599,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -604,7 +622,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -612,8 +634,9 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. 
- let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64_8px(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = + deinterleave_ayuv64_8px::(packed.as_ptr().add(x * 4 + 32)); // Direct copy — Y samples are 16-bit native (no shift needed). v128_store(luma_out.as_mut_ptr().add(x).cast(), y_lo); @@ -624,7 +647,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/wasm_simd128/tests/ayuv64.rs b/src/row/arch/wasm_simd128/tests/ayuv64.rs index cb42ec13..3521aafc 100644 --- a/src/row/arch/wasm_simd128/tests/ayuv64.rs +++ b/src/row/arch/wasm_simd128/tests/ayuv64.rs @@ -21,9 +21,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -42,11 +44,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -60,9 +64,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm ayuv64→luma diverges (width={width})"); } @@ -71,9 +75,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm ayuv64→luma u16 diverges (width={width})"); } @@ -150,7 +154,7 @@ fn wasm_ayuv64_lane_order_high_bit_set_values() { // luma u8 high-byte extraction: 0x8001 >> 8 = 0x80 for every pixel let mut luma_u8 = std::vec![0u8; W]; unsafe { - ayuv64_to_luma_row(&packed, &mut luma_u8, W); + ayuv64_to_luma_row::(&packed, &mut luma_u8, W); } let expected_luma: std::vec::Vec = std::vec![0x80; W]; assert_eq!( @@ -161,7 +165,13 @@ fn wasm_ayuv64_lane_order_high_bit_set_values() { // u8 RGBA α depth-convert: 0x8000+n >> 8 = 0x80 for n in 0..16 (since n < 256) let mut rgba_u8 = std::vec![0u8; W * 4]; unsafe { - ayuv64_to_rgb_or_rgba_row::(&packed, &mut rgba_u8, W, ColorMatrix::Bt709, true); + ayuv64_to_rgb_or_rgba_row::( + &packed, + &mut rgba_u8, + W, + ColorMatrix::Bt709, + true, + ); } 
let alpha_out: std::vec::Vec = (0..W).map(|n| rgba_u8[n * 4 + 3]).collect(); let expected_alpha: std::vec::Vec = std::vec![0x80; W]; @@ -206,7 +216,7 @@ fn wasm_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no shift, no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16u16).collect(); assert_eq!(luma_out, expected_luma, "wasm ayuv64→luma_u16 reorder bug"); @@ -215,7 +225,7 @@ fn wasm_ayuv64_lane_order_per_pixel_y_and_a() { // Use full_range=true so neutral chroma gives a well-defined Y output. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, diff --git a/src/row/arch/wasm_simd128/tests/v410.rs b/src/row/arch/wasm_simd128/tests/v410.rs index 0c5310a4..30f2272e 100644 --- a/src/row/arch/wasm_simd128/tests/v410.rs +++ b/src/row/arch/wasm_simd128/tests/v410.rs @@ -24,9 +24,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -38,9 +38,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -52,9 +52,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -66,9 +66,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -80,9 +80,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { 
- v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v410→luma diverges (width={width})"); } @@ -91,9 +91,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "simd128 v410→luma u16 diverges (width={width})"); } @@ -197,7 +197,7 @@ fn wasm_simd128_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma_out = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma_out, W); + v410_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -209,9 +209,15 @@ fn wasm_simd128_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/tests/xv36.rs b/src/row/arch/wasm_simd128/tests/xv36.rs index dd8375ca..06f53ea4 100644 --- a/src/row/arch/wasm_simd128/tests/xv36.rs +++ b/src/row/arch/wasm_simd128/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "wasm xv36→luma u16 diverges (width={width})"); } @@ -171,7 
+171,7 @@ fn wasm_simd128_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -183,9 +183,15 @@ fn wasm_simd128_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, diff --git a/src/row/arch/wasm_simd128/v410.rs b/src/row/arch/wasm_simd128/v410.rs index 24dead51..aa135e78 100644 --- a/src/row/arch/wasm_simd128/v410.rs +++ b/src/row/arch/wasm_simd128/v410.rs @@ -22,7 +22,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- u8 RGB / RGBA output ----------------------------------------------- @@ -38,7 +38,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -71,8 +71,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 4 <= width { - // Load 4 V410 words. - let words = v128_load(packed.as_ptr().add(x).cast()); + // Load 4 V410 words (with BE byte-swap if required). + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20). let u_i32 = v128_and(words, mask); @@ -152,7 +152,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -171,7 +177,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -208,7 +214,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); let u_i32 = v128_and(words, mask); let y_i32 = v128_and(u32x4_shr(words, 10), mask); @@ -276,7 +282,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -300,7 +306,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. 
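The `endian::load_endian_u32x4` helper imported by the v410 kernels in this file is defined in the backend's `endian` module and does not appear in this hunk. A minimal sketch of the intended behaviour, assuming it is a plain `v128_load` plus a per-u32 byte reversal when the wire format is BE (wasm32 itself is always little-endian, so BE wire data always needs the swap); the name and signature below are illustrative only:

    use core::arch::wasm32::*;

    #[target_feature(enable = "simd128")]
    unsafe fn load_endian_u32x4_sketch<const BE: bool>(ptr: *const u8) -> v128 {
        let raw = v128_load(ptr.cast());
        if BE {
            // Reverse the four bytes inside each 32-bit lane.
            u8x16_swizzle(
                raw,
                u8x16(3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12),
            )
        } else {
            raw
        }
    }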
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -310,7 +320,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); // Y field: bits 19:10 → shift right 10, mask to 10-bit. let y_i32 = v128_and(u32x4_shr(words, 10), mask); // Narrow i32x4 → i16x8 (4 valid lo lanes + 4 zero hi lanes). @@ -327,7 +337,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 4 pixels. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -345,7 +355,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -355,7 +369,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 4 <= width { - let words = v128_load(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); let y_i32 = v128_and(u32x4_shr(words, 10), mask); // Narrow i32x4 → i16x8: 4 valid lo lanes (values ≤ 1023, no saturation). let y_i16 = i16x8_narrow_i32x4(y_i32, zero4); @@ -368,7 +382,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/wasm_simd128/xv36.rs b/src/row/arch/wasm_simd128/xv36.rs index f34bde20..5e4770e1 100644 --- a/src/row/arch/wasm_simd128/xv36.rs +++ b/src/row/arch/wasm_simd128/xv36.rs @@ -39,7 +39,7 @@ use core::arch::wasm32::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; /// Deinterleave 8 XV36 pixels (4 × v128 = 64 bytes) into separate @@ -55,13 +55,14 @@ use crate::{ColorMatrix, row::scalar}; /// caller must `u16x8_shr(v, 4)` to drop the 4 padding LSBs. #[inline] #[target_feature(enable = "simd128")] -unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { +unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { unsafe { // Load 4 × v128, each covering 2 pixels. - let raw0 = v128_load(ptr.cast()); // [U0,Y0,V0,A0, U1,Y1,V1,A1] - let raw1 = v128_load(ptr.add(8).cast()); // [U2,Y2,V2,A2, U3,Y3,V3,A3] - let raw2 = v128_load(ptr.add(16).cast()); // [U4,Y4,V4,A4, U5,Y5,V5,A5] - let raw3 = v128_load(ptr.add(24).cast()); // [U6,Y6,V6,A6, U7,Y7,V7,A7] + // For BE wire format, `load_endian_u16x8` byte-swaps each u16 lane. 
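            // Presumably this reduces to a plain `v128_load` when `BE` is false and adds a
            // per-u16 byte swap (for example `u8x16_swizzle` with indices 1,0,3,2,…,15,14)
            // when `BE` is true; the helper comes from the backend's `endian` module.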
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // [U0,Y0,V0,A0, U1,Y1,V1,A1] + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // [U2,Y2,V2,A2, U3,Y3,V3,A3] + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // [U4,Y4,V4,A4, U5,Y5,V5,A5] + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // [U6,Y6,V6,A6, U7,Y7,V7,A7] // Per-channel byte positions inside a 2-pixel v128: // U → bytes 0,1 (pixel n) and 8,9 (pixel n+1) @@ -137,7 +138,7 @@ unsafe fn deinterleave_xv36_8px(ptr: *const u16) -> (v128, v128, v128) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -169,7 +170,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Deinterleave 8 XV36 pixels (64 bytes) into U/Y/V channels. - let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px(packed.as_ptr().add(x * 4)); + let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px::(packed.as_ptr().add(x * 4)); // Right-shift by 4 to drop the 4 padding LSBs → 12-bit [0, 4095]. // Values ≤ 4095 fit safely in i16. @@ -241,7 +242,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -261,7 +268,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -296,7 +303,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px(packed.as_ptr().add(x * 4)); + let (u_raw, y_raw, v_raw) = deinterleave_xv36_8px::(packed.as_ptr().add(x * 4)); let u_i16 = u16x8_shr(u_raw, 4); let y_i16 = u16x8_shr(y_raw, 4); @@ -359,7 +366,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -384,7 +391,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -396,10 +407,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { let ptr = packed.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); // pixels 0,1 - let raw1 = v128_load(ptr.add(8).cast()); // pixels 2,3 - let raw2 = v128_load(ptr.add(16).cast()); // pixels 4,5 - let raw3 = v128_load(ptr.add(24).cast()); // pixels 6,7 + // For BE wire format, byte-swap each u16 before extracting Y via swizzle. 
+ let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // pixels 0,1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // pixels 2,3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // pixels 4,5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // pixels 6,7 // Extract Y from each pair → 2 u16 in low 4 bytes. let y0 = u8x16_swizzle(raw0, y_idx); // [Y0,Y1, 0..12] @@ -425,7 +437,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -445,7 +457,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -455,10 +471,10 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { let ptr = packed.as_ptr().add(x * 4); - let raw0 = v128_load(ptr.cast()); - let raw1 = v128_load(ptr.add(8).cast()); - let raw2 = v128_load(ptr.add(16).cast()); - let raw3 = v128_load(ptr.add(24).cast()); + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); let y0 = u8x16_swizzle(raw0, y_idx); let y1 = u8x16_swizzle(raw1, y_idx); @@ -477,7 +493,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx2/ayuv64.rs b/src/row/arch/x86_avx2/ayuv64.rs index c052f9c1..d3616d10 100644 --- a/src/row/arch/x86_avx2/ayuv64.rs +++ b/src/row/arch/x86_avx2/ayuv64.rs @@ -73,7 +73,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper (16 pixels / 64 u16 / 128 bytes) --------------- @@ -112,7 +112,9 @@ use crate::{ColorMatrix, row::scalar}; /// elements). Caller's `target_feature` must include AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i, __m256i) { +unsafe fn deinterleave_ayuv64_16px_avx2( + ptr: *const u16, +) -> (__m256i, __m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 128 bytes readable; AVX2 is // available. 
unsafe { @@ -123,10 +125,12 @@ unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, _ // raw_c1 lo=A4..V4,A5..V5 hi=A6..V6,A7..V7 (pixels 4..7) // raw_c2 lo=A8..V8,A9..V9 hi=A10..V10,A11..V11 (pixels 8..11) // raw_c3 lo=A12..V12,A13..V13 hi=A14..V14,A15..V15 (pixels 12..15) - let raw_c0 = _mm256_loadu_si256(ptr.cast()); - let raw_c1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw_c2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw_c3 = _mm256_loadu_si256(ptr.add(48).cast()); + // + // For BE wire format, `load_endian_u16x16` byte-swaps each u16 lane. + let raw_c0 = endian::load_endian_u16x16::(ptr as *const u8); + let raw_c1 = endian::load_endian_u16x16::(ptr.add(16) as *const u8); + let raw_c2 = endian::load_endian_u16x16::(ptr.add(32) as *const u8); + let raw_c3 = endian::load_endian_u16x16::(ptr.add(48) as *const u8); // Reshape via cross-lane permute so each register holds the layout // the per-128-bit-lane cascade below expects: @@ -206,7 +210,11 @@ unsafe fn deinterleave_ayuv64_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, _ /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -246,7 +254,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16 (-32768i16 == 0x8000). let u_lo_i16 = _mm256_sub_epi16(u_lo_u16, bias16_v); @@ -286,7 +294,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 64)); let u_hi_i16 = _mm256_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm256_sub_epi16(v_hi_u16, bias16_v); @@ -362,7 +370,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -395,7 +403,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -433,7 +445,8 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm256_sub_epi16(u_u16, bias16_v); @@ -569,7 +582,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -585,7 +598,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -593,7 +606,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -601,7 +614,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. 
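The four 256-bit loads above go through `endian::load_endian_u16x16::<BE>`. A hedged sketch of such a helper, assuming the host-aware gate means "swap only when the wire endianness differs from the host's"; the name and signature are illustrative, not this patch's API:

    use core::arch::x86_64::*;

    #[target_feature(enable = "avx2")]
    unsafe fn load_endian_u16x16_sketch<const BE: bool>(ptr: *const u8) -> __m256i {
        let raw = _mm256_loadu_si256(ptr.cast());
        if BE != cfg!(target_endian = "big") {
            // Swap the two bytes of every 16-bit lane; `_mm256_shuffle_epi8` works
            // within each 128-bit half, so the index pattern repeats in both halves.
            let idx = _mm256_set_epi8(
                14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1, //
                14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1,
            );
            _mm256_shuffle_epi8(raw, idx)
        } else {
            raw
        }
    }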
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -609,14 +622,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// AVX2 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -624,7 +637,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -632,7 +647,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -640,7 +655,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -664,7 +681,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -673,7 +694,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Deinterleave 16 pixels and discard A/U/V. - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec lo lane = [Y0..Y7], hi lane = [Y8..Y15] (16 u16 in natural order). // `>> 8` → high byte of each Y u16. Then narrow to u8. @@ -693,7 +714,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -719,7 +740,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -727,7 +752,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] unsafe { let mut x = 0usize; while x + 16 <= width { - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_16px_avx2::(packed.as_ptr().add(x * 4)); // Direct store — Y samples are 16-bit native, in natural pixel order. _mm256_storeu_si256(luma_out.as_mut_ptr().add(x).cast(), y_vec); x += 16; @@ -735,7 +760,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_avx2/tests/ayuv64.rs b/src/row/arch/x86_avx2/tests/ayuv64.rs index c9f89115..7344ddd9 100644 --- a/src/row/arch/x86_avx2/tests/ayuv64.rs +++ b/src/row/arch/x86_avx2/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 ayuv64→luma u16 diverges (width={width})"); } @@ -179,7 +183,7 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). 
--- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -192,7 +196,7 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -208,3 +212,160 @@ fn avx2_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test for AYUV64 — exercises the host-aware +/// endian gate via `endian::load_endian_u16x*::` and covers the +/// source-α path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut 
out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/tests/v410.rs b/src/row/arch/x86_avx2/tests/v410.rs index 3acec1aa..a57e0d70 100644 --- a/src/row/arch/x86_avx2/tests/v410.rs +++ b/src/row/arch/x86_avx2/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 v410→luma u16 diverges (width={width})"); } @@ -183,7 +183,7 @@ fn avx2_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx2 v410 luma reorder bug"); @@ -192,9 +192,15 @@ fn avx2_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -206,3 +212,121 @@ fn 
avx2_v410_lane_order_per_pixel_y_and_u() { "avx2 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x*::`. Existing per-backend tests use +/// `BE=false` only; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/tests/xv36.rs b/src/row/arch/x86_avx2/tests/xv36.rs index be5cc011..7b703951 100644 --- a/src/row/arch/x86_avx2/tests/xv36.rs +++ b/src/row/arch/x86_avx2/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let 
bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX2 xv36→luma u16 diverges (width={width})"); } @@ -178,7 +178,7 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "avx2 xv36 luma_u16 reorder bug"); @@ -187,9 +187,15 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -201,3 +207,120 @@ fn avx2_xv36_lane_order_per_pixel_y_and_u() { "avx2 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x*::` for AVX2. See sibling v410 test for +/// rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx2_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
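    // Concretely: `0x1234u16.to_be_bytes()` is `[0x12, 0x34]` and `to_le_bytes()` is
    // `[0x34, 0x12]` on every host, whereas the in-memory bytes behind
    // `0x1234u16.swap_bytes()` (= 0x3412) depend on the host's own byte order.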
+ for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx2 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx2/v410.rs b/src/row/arch/x86_avx2/v410.rs index 90144637..934cad98 100644 --- a/src/row/arch/x86_avx2/v410.rs +++ b/src/row/arch/x86_avx2/v410.rs @@ -28,7 +28,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Bit-extraction helper ----------------------------------------------- @@ -47,11 +47,11 @@ use crate::{ColorMatrix, row::scalar}; /// that `target_feature` includes AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 32 bytes readable; AVX2 is // available. unsafe { - let words = _mm256_loadu_si256(ptr.cast()); + let words = endian::load_endian_u32x8::(ptr as *const u8); let mask = _mm256_set1_epi32(0x3FF); // Extract 10-bit fields in i32x8 (values ≤ 1023 — no overflow risk). @@ -88,7 +88,7 @@ unsafe fn unpack_v410_8px_avx2(ptr: *const u32) -> (__m256i, __m256i, __m256i) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -121,7 +121,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Unpack 8 V410 words → three i16x16 with valid data in lanes 0..7. - let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2::(packed.as_ptr().add(x)); // Subtract chroma bias (512 for 10-bit). let u_sub = _mm256_sub_epi16(u_i16, bias_v); @@ -201,7 +201,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -222,7 +228,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -257,7 +263,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_8px_avx2::(packed.as_ptr().add(x)); let u_sub = _mm256_sub_epi16(u_i16, bias_v); let v_sub = _mm256_sub_epi16(v_i16, bias_v); @@ -331,7 +337,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -357,7 +363,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -367,7 +377,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { - let words = _mm256_loadu_si256(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x8::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask); @@ -390,7 +400,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 8 pixels. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -411,7 +421,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
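Each SIMD kernel in this patch hands its sub-vector-width remainder to the matching `scalar::*_row::<…, BE>` function, so the scalar kernels carry the same `BE` parameter. A minimal sketch of the per-word step a BE-aware scalar v410 kernel presumably performs; the helper name is invented for illustration:

    #[inline]
    fn v410_wire_word<const BE: bool>(word: u32) -> u32 {
        // `word` holds the raw wire bytes reinterpreted as a native-endian u32;
        // `from_be` / `from_le` recover the intended value on any host.
        if BE { u32::from_be(word) } else { u32::from_le(word) }
    }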
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -421,7 +435,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { - let words = _mm256_loadu_si256(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x8::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm256_and_si256(_mm256_srli_epi32::<10>(words), mask); @@ -440,7 +454,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail — remaining < 8 pixels. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx2/xv36.rs b/src/row/arch/x86_avx2/xv36.rs index a2eb686f..eb5b6440 100644 --- a/src/row/arch/x86_avx2/xv36.rs +++ b/src/row/arch/x86_avx2/xv36.rs @@ -49,7 +49,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -77,7 +77,7 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include AVX2. #[inline] #[target_feature(enable = "avx2")] -unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) { +unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) { // SAFETY: caller obligation — `ptr` has 128 bytes readable; AVX2 is // available. unsafe { @@ -88,10 +88,12 @@ unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) // raw_c1 = pixels 4..7 (lo=P4,P5; hi=P6,P7) // raw_c2 = pixels 8..11 (lo=P8,P9; hi=P10,P11) // raw_c3 = pixels 12..15 (lo=P12,P13; hi=P14,P15) - let raw_c0 = _mm256_loadu_si256(ptr.cast()); - let raw_c1 = _mm256_loadu_si256(ptr.add(16).cast()); - let raw_c2 = _mm256_loadu_si256(ptr.add(32).cast()); - let raw_c3 = _mm256_loadu_si256(ptr.add(48).cast()); + // + // For BE wire format, `load_endian_u16x16` byte-swaps each u16 lane. + let raw_c0 = endian::load_endian_u16x16::(ptr as *const u8); + let raw_c1 = endian::load_endian_u16x16::(ptr.add(16) as *const u8); + let raw_c2 = endian::load_endian_u16x16::(ptr.add(32) as *const u8); + let raw_c3 = endian::load_endian_u16x16::(ptr.add(48) as *const u8); // Reshape via cross-lane permute so each register holds the layout the // per-128-bit-lane cascade below expects: @@ -173,7 +175,7 @@ unsafe fn unpack_xv36_16px_avx2(ptr: *const u16) -> (__m256i, __m256i, __m256i) /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -206,7 +208,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { // Deinterleave 16 XV36 quadruples → U, Y, V as i16x16 in [0, 4095]. - let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // Values ≤ 4095 < 32767 — safe to treat as signed i16. 
let u_i16 = u_u16; @@ -288,7 +290,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -309,7 +317,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -346,7 +354,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -423,7 +431,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -452,7 +460,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -460,7 +472,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi unsafe { let mut x = 0usize; while x + 16 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec is already >> 4 (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that is the MSB-aligned value >> 4 @@ -482,7 +494,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail — remaining < 16 pixels. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -504,7 +516,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -512,7 +528,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width unsafe { let mut x = 0usize; while x + 16 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_16px_avx2::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (= 12-bit value in [0, 4095]). // Direct store of 16 × u16. 
@@ -523,7 +539,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail — remaining < 16 pixels. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx512/ayuv64.rs b/src/row/arch/x86_avx512/ayuv64.rs index 3ad3bdee..e0b767a4 100644 --- a/src/row/arch/x86_avx512/ayuv64.rs +++ b/src/row/arch/x86_avx512/ayuv64.rs @@ -74,7 +74,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables ---------------------------------------- @@ -192,7 +192,9 @@ static COMBINE_IDX: [i16; 32] = [ /// AVX-512BW (BW provides `vpermt2w`). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i, __m512i) { +unsafe fn deinterleave_ayuv64_32px_avx512( + ptr: *const u16, +) -> (__m512i, __m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 256 bytes readable; AVX-512F + // AVX-512BW are available. unsafe { @@ -203,10 +205,12 @@ unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, // v1 lanes: A8..V8,...,A15..V15 (pixels 8..15) // v2 lanes: A16..V16,...,A23..V23 (pixels 16..23) // v3 lanes: A24..V24,...,A31..V31 (pixels 24..31) - let v0 = _mm512_loadu_si512(ptr.cast()); - let v1 = _mm512_loadu_si512(ptr.add(32).cast()); - let v2 = _mm512_loadu_si512(ptr.add(64).cast()); - let v3 = _mm512_loadu_si512(ptr.add(96).cast()); + // + // For BE wire format, `load_endian_u16x32` byte-swaps each u16 lane. + let v0 = endian::load_endian_u16x32::(ptr as *const u8); + let v1 = endian::load_endian_u16x32::(ptr.add(32) as *const u8); + let v2 = endian::load_endian_u16x32::(ptr.add(64) as *const u8); + let v3 = endian::load_endian_u16x32::(ptr.add(96) as *const u8); // Load permute index tables. let a_idx = _mm512_loadu_si512(A_FROM_PAIR_IDX.as_ptr().cast()); @@ -259,7 +263,11 @@ unsafe fn deinterleave_ayuv64_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -300,7 +308,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16 (-32768i16 == 0x8000). let u_lo_i16 = _mm512_sub_epi16(u_lo_u16, bias16_v); @@ -346,7 +354,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 128)); let u_hi_i16 = _mm512_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm512_sub_epi16(v_hi_u16, bias16_v); @@ -430,7 +438,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -463,7 +471,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). 
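// Illustrative sketch (not part of this patch): scalar equivalent of the
// "center chroma via wrapping i16 subtraction" step used by the AYUV64
// kernels above. Subtracting i16::MIN (bit pattern 0x8000) from the chroma
// sample reinterpreted as i16 is exactly `chroma - 32768`, which is what the
// SIMD `_mm*_sub_epi16` against a -32768 bias lane computes.
fn center_chroma_i16(chroma: u16) -> i16 {
    (chroma as i16).wrapping_sub(i16::MIN)
}

#[test]
fn center_chroma_matches_wide_math() {
    for c in [0u16, 1, 32767, 32768, 40000, 65535] {
        assert_eq!(i32::from(center_chroma_i16(c)), i32::from(c) - 32768);
    }
}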
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -508,7 +520,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm512_sub_epi16(u_u16, bias16_v); @@ -651,7 +663,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -667,7 +679,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -675,7 +687,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -683,7 +695,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -691,14 +703,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// AVX-512 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -706,7 +718,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -714,7 +728,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -722,7 +736,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -746,7 +762,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -758,7 +778,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 32 <= width { // Deinterleave 32 pixels and discard A/U/V. 
- let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec is i16x32 with Y0..Y31 (16-bit native). // `>> 8` → high byte of each Y u16. Then narrow to u8. @@ -778,7 +798,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -804,7 +824,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -812,7 +836,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] unsafe { let mut x = 0usize; while x + 32 <= width { - let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512(packed.as_ptr().add(x * 4)); + let (_a, y_vec, _u, _v) = deinterleave_ayuv64_32px_avx512::(packed.as_ptr().add(x * 4)); // Direct store — Y samples are 16-bit native, in natural pixel order. _mm512_storeu_si512(luma_out.as_mut_ptr().add(x).cast(), y_vec); x += 32; @@ -820,7 +844,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_avx512/tests/ayuv64.rs b/src/row/arch/x86_avx512/tests/ayuv64.rs index a2b2872c..be84a70b 100644 --- a/src/row/arch/x86_avx512/tests/ayuv64.rs +++ b/src/row/arch/x86_avx512/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; 
width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 ayuv64→luma u16 diverges (width={width})"); } @@ -195,7 +199,7 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!( @@ -208,7 +212,7 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -224,3 +228,162 @@ fn avx512_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test for AYUV64 — exercises the host-aware +/// endian gate via `endian::load_endian_u16x*::` and covers the +/// source-α path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
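// Illustrative sketch (not part of this patch): the buffer-construction
// pattern the parity tests rely on, shown for a single u16 sample.
// `to_le_bytes`/`to_be_bytes` fix the wire byte order regardless of host, and
// `from_ne_bytes` reinterprets those wire bytes as the in-memory lane the
// kernel actually loads — so on any host, the LE buffer driven through
// `BE = false` and the BE buffer driven through `BE = true` describe the same
// intended sample.
#[test]
fn wire_bytes_roundtrip_is_host_independent() {
    let intended: u16 = 0x1234;
    let le_wire = intended.to_le_bytes(); // [0x34, 0x12] on every host
    let be_wire = intended.to_be_bytes(); // [0x12, 0x34] on every host
    // In-memory lanes as the SIMD/scalar kernels see them.
    let le_lane = u16::from_ne_bytes(le_wire);
    let be_lane = u16::from_ne_bytes(be_wire);
    // Decoding each lane with the matching wire endianness recovers the
    // intended sample on both LE and BE hosts.
    assert_eq!(u16::from_le_bytes(le_lane.to_ne_bytes()), intended);
    assert_eq!(u16::from_be_bytes(be_lane.to_ne_bytes()), intended);
}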
+ for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/tests/v410.rs b/src/row/arch/x86_avx512/tests/v410.rs index d1b1e17c..928eb178 100644 --- a/src/row/arch/x86_avx512/tests/v410.rs +++ b/src/row/arch/x86_avx512/tests/v410.rs @@ -24,9 +24,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 3]; let mut k = std::vec![0u8; width * 3]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -38,9 +38,9 @@ fn check_rgba(width: usize, matrix: ColorMatrix, full_range: bool) { let p = 
pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u8; width * 4]; let mut k = std::vec![0u8; width * 4]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -52,9 +52,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 3]; let mut k = std::vec![0u16; width * 3]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -66,9 +66,9 @@ fn check_rgba_u16(width: usize, matrix: ColorMatrix, full_range: bool) { let p = pseudo_random_v410_words(width, 0xAA55); let mut s = std::vec![0u16; width * 4]; let mut k = std::vec![0u16; width * 4]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, k, @@ -80,9 +80,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v410→luma diverges (width={width})"); } @@ -91,9 +91,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410_words(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 v410→luma u16 diverges (width={width})"); } @@ -215,7 +215,7 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "avx512 v410 luma reorder bug"); @@ -224,9 +224,15 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -238,3 +244,124 @@ fn avx512_v410_lane_order_per_pixel_y_and_u() { "avx512 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x*::` for AVX-512. 
Existing per-backend +/// tests use `BE=false` only; existing dispatcher BE-vs-LE tests use +/// `use_simd=false`, so the SIMD endian gate is otherwise untested. Widths +/// 33 and 65 cover ≥1 main-loop iteration plus a scalar tail. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_v410_words(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/tests/xv36.rs b/src/row/arch/x86_avx512/tests/xv36.rs index c73a4d1a..fd202408 100644 --- a/src/row/arch/x86_avx512/tests/xv36.rs +++ b/src/row/arch/x86_avx512/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; 
width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "AVX-512 xv36→luma u16 diverges (width={width})"); } @@ -190,7 +190,7 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "avx512 xv36 luma_u16 reorder bug"); @@ -199,9 +199,15 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -213,3 +219,122 @@ fn avx512_xv36_lane_order_per_pixel_y_and_u() { "avx512 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x*::` for AVX-512. See sibling v410 test +/// for rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn avx512_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("avx512f") + || !std::arch::is_x86_feature_detected!("avx512bw") + { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [15usize, 16, 33, 65] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "avx512 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_avx512/v410.rs b/src/row/arch/x86_avx512/v410.rs index 0862aaf8..79cddd11 100644 --- a/src/row/arch/x86_avx512/v410.rs +++ b/src/row/arch/x86_avx512/v410.rs @@ -30,7 +30,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Bit-extraction helper ----------------------------------------------- @@ -49,11 +49,11 @@ use crate::{ColorMatrix, row::scalar}; /// that `target_feature` includes AVX-512F + AVX-512BW. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 64 bytes readable; AVX-512F // + AVX-512BW are available. 
unsafe { - let words = _mm512_loadu_si512(ptr.cast()); + let words = endian::load_endian_u32x16::(ptr as *const u8); let mask = _mm512_set1_epi32(0x3FF); // Extract 10-bit fields in i32x16 (values ≤ 1023 — no overflow risk). @@ -87,7 +87,7 @@ unsafe fn unpack_v410_16px_avx512(ptr: *const u32) -> (__m512i, __m512i, __m512i /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -121,7 +121,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 16 <= width { // Unpack 16 V410 words → three i16x32 with valid data in lanes 0..16. - let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512::(packed.as_ptr().add(x)); // Subtract chroma bias (512 for 10-bit). let u_sub = _mm512_sub_epi16(u_i16, bias_v); @@ -201,7 +201,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -222,7 +228,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -258,7 +264,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 16 <= width { - let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512(packed.as_ptr().add(x)); + let (u_i16, y_i16, v_i16) = unpack_v410_16px_avx512::(packed.as_ptr().add(x)); let u_sub = _mm512_sub_epi16(u_i16, bias_v); let v_sub = _mm512_sub_epi16(v_i16, bias_v); @@ -332,7 +338,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -358,7 +364,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -369,7 +379,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 16 <= width { - let words = _mm512_loadu_si512(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x16::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask); @@ -393,7 +403,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail — remaining < 16 pixels. 
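// Illustrative sketch (not part of this patch): scalar view of the 10-bit
// field extraction the v410 kernels perform on each 32-bit word. Only the Y
// position (bits 10..19, i.e. `(word >> 10) & 0x3FF`) is stated in this
// patch; placing U in bits 0..9 and V in bits 20..29 (top two bits unused)
// is an assumption about the packing.
fn unpack_v410_word(word: u32) -> (u16, u16, u16) {
    let u = (word & 0x3FF) as u16; // assumed: bits 0..9
    let y = ((word >> 10) & 0x3FF) as u16; // bits 10..19, as in the kernels
    let v = ((word >> 20) & 0x3FF) as u16; // assumed: bits 20..29
    (u, y, v)
}

#[test]
fn unpack_v410_word_example() {
    let word = (300u32 << 20) | (200 << 10) | 100;
    assert_eq!(unpack_v410_word(word), (100, 200, 300));
}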
if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -414,7 +424,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -424,7 +438,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 16 <= width { - let words = _mm512_loadu_si512(packed.as_ptr().add(x).cast()); + let words = endian::load_endian_u32x16::(packed.as_ptr().add(x) as *const u8); // Y = (word >> 10) & 0x3FF for each i32 lane. let y_i32 = _mm512_and_si512(_mm512_srli_epi32::<10>(words), mask); @@ -442,7 +456,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail — remaining < 16 pixels. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_avx512/xv36.rs b/src/row/arch/x86_avx512/xv36.rs index e0833228..fa50208d 100644 --- a/src/row/arch/x86_avx512/xv36.rs +++ b/src/row/arch/x86_avx512/xv36.rs @@ -40,7 +40,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Static permute index tables ----------------------------------------- @@ -145,15 +145,16 @@ static COMBINE_IDX: [i16; 32] = [ /// `vpermt2w` — the u16 cross-vector permute). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i) { +unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i) { // SAFETY: caller obligation — `ptr` has 256 bytes readable; AVX-512F + // AVX-512BW are available. unsafe { // Load 4 × __m512i (32 pixels × 4 u16 channels = 128 u16 = 256 bytes). - let v0 = _mm512_loadu_si512(ptr.cast()); // pixels 0.. 7 - let v1 = _mm512_loadu_si512(ptr.add(32).cast()); // pixels 8..15 - let v2 = _mm512_loadu_si512(ptr.add(64).cast()); // pixels 16..23 - let v3 = _mm512_loadu_si512(ptr.add(96).cast()); // pixels 24..31 + // For BE wire format, `load_endian_u16x32` byte-swaps each u16 lane. + let v0 = endian::load_endian_u16x32::(ptr as *const u8); // pixels 0.. 7 + let v1 = endian::load_endian_u16x32::(ptr.add(32) as *const u8); // pixels 8..15 + let v2 = endian::load_endian_u16x32::(ptr.add(64) as *const u8); // pixels 16..23 + let v3 = endian::load_endian_u16x32::(ptr.add(96) as *const u8); // pixels 24..31 // Load permute index tables. let uv_idx = _mm512_loadu_si512(UV_FROM_PAIR_IDX.as_ptr().cast()); @@ -201,7 +202,7 @@ unsafe fn unpack_xv36_32px_avx512(ptr: *const u16) -> (__m512i, __m512i, __m512i /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -235,7 +236,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 32 <= width { // Deinterleave 32 XV36 quadruples → U, Y, V as i16x32 in [0, 4095]. - let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // Values ≤ 4095 < 32767 — safe to treat as signed i16. let u_i16 = u_u16; @@ -317,7 +318,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -338,7 +345,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -376,7 +383,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 32 <= width { - let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -437,7 +444,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -466,7 +473,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -477,7 +488,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 32 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec is already >> 4 (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that is MSB-aligned >> 4 to get @@ -495,7 +506,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail — remaining < 32 pixels. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -517,7 +528,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -525,7 +540,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width unsafe { let mut x = 0usize; while x + 32 <= width { - let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = unpack_xv36_32px_avx512::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (= 12-bit value in [0, 4095]). // Direct store of 32 × u16. @@ -536,7 +551,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail — remaining < 32 pixels. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_sse41/ayuv64.rs b/src/row/arch/x86_sse41/ayuv64.rs index 561a3207..f6b11298 100644 --- a/src/row/arch/x86_sse41/ayuv64.rs +++ b/src/row/arch/x86_sse41/ayuv64.rs @@ -55,7 +55,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -74,13 +74,16 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include SSE4.1. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn deinterleave_ayuv64(ptr: *const u16) -> (__m128i, __m128i, __m128i, __m128i) { +unsafe fn deinterleave_ayuv64( + ptr: *const u16, +) -> (__m128i, __m128i, __m128i, __m128i) { unsafe { // Load 4 × __m128i (8 pixels × 4 channels × u16 = 64 bytes). - let raw0 = _mm_loadu_si128(ptr.cast()); // A0,Y0,U0,V0, A1,Y1,U1,V1 - let raw1 = _mm_loadu_si128(ptr.add(8).cast()); // A2,Y2,U2,V2, A3,Y3,U3,V3 - let raw2 = _mm_loadu_si128(ptr.add(16).cast()); // A4,Y4,U4,V4, A5,Y5,U5,V5 - let raw3 = _mm_loadu_si128(ptr.add(24).cast()); // A6,Y6,U6,V6, A7,Y7,U7,V7 + // BE=true: byte-swap within each u16 lane to correct wire endianness. + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // A0,Y0,U0,V0, A1,Y1,U1,V1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // A2,Y2,U2,V2, A3,Y3,U3,V3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // A4,Y4,U4,V4, A5,Y5,U5,V5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // A6,Y6,U6,V6, A7,Y7,U7,V7 // Level 1 unpack (pairs 0-1, pairs 2-3). let s1_lo = _mm_unpacklo_epi16(raw0, raw1); // A0,A2,Y0,Y2,U0,U2,V0,V2 @@ -123,7 +126,11 @@ unsafe fn deinterleave_ayuv64(ptr: *const u16) -> (__m128i, __m128i, __m128i, __ /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -159,7 +166,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4)); // Center chroma: subtract 32768 via wrapping i16. 
let u_lo_i16 = _mm_sub_epi16(u_lo_u16, bias16_v); @@ -200,7 +207,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row(packed.as_ptr().add(x * 4 + 32)); let u_hi_i16 = _mm_sub_epi16(u_hi_u16, bias16_v); let v_hi_i16 = _mm_sub_epi16(v_hi_u16, bias16_v); @@ -266,7 +273,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row( + scalar::ayuv64_to_rgb_or_rgba_row::( tail_packed, tail_out, tail_w, @@ -297,7 +304,11 @@ pub(crate) unsafe fn ayuv64_to_rgb_or_rgba_row= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -333,7 +344,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed.as_ptr().add(x * 4)); // Center chroma via wrapping i16 subtraction. let u_i16 = _mm_sub_epi16(u_u16, bias16_v); @@ -460,7 +471,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -476,7 +487,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u8], width: usize, @@ -484,7 +495,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } } @@ -492,7 +503,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_row( /// to u8 via `>> 8`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgba_row( +pub(crate) unsafe fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, @@ -500,14 +511,14 @@ pub(crate) unsafe fn ayuv64_to_rgba_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } } /// SSE4.1 AYUV64 → packed **RGB u16** (3 × u16 per pixel). Source α discarded. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgb_u16_row( +pub(crate) unsafe fn ayuv64_to_rgb_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, @@ -515,7 +526,9 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); } } @@ -523,7 +536,7 @@ pub(crate) unsafe fn ayuv64_to_rgb_u16_row( /// is written direct (no conversion). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_rgba_u16_row( +pub(crate) unsafe fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, @@ -531,7 +544,9 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( full_range: bool, ) { unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); } } @@ -551,7 +566,11 @@ pub(crate) unsafe fn ayuv64_to_rgba_u16_row( /// 3. `luma_out.len() >= width`. 
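// Illustrative sketch (not part of this patch): the per-sample gate the
// scalar tails that now take a `BE` const parameter plausibly reduce to. The
// helper name is hypothetical; `u16::from_be`/`from_le` are identity when the
// host already matches the wire order and a byte swap otherwise — the same
// host-aware behaviour the SIMD paths get from their shuffle-based loads.
#[inline]
fn decode_wire_u16<const BE: bool>(lane: u16) -> u16 {
    if BE { u16::from_be(lane) } else { u16::from_le(lane) }
}

#[test]
fn decode_wire_u16_recovers_intended_sample() {
    let intended: u16 = 0x1234;
    let le_lane = u16::from_ne_bytes(intended.to_le_bytes());
    let be_lane = u16::from_ne_bytes(intended.to_be_bytes());
    assert_eq!(decode_wire_u16::<false>(le_lane), intended);
    assert_eq!(decode_wire_u16::<true>(be_lane), intended);
}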
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -559,8 +578,8 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4 + 32)); // >> 8 to get u8 luma (high byte of each Y u16 sample). let y_lo_shr = _mm_srli_epi16::<8>(y_lo); @@ -574,7 +593,7 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid // Scalar tail. if x < width { - scalar::ayuv64_to_luma_row( + scalar::ayuv64_to_luma_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, @@ -597,7 +616,11 @@ pub(crate) unsafe fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], wid /// 3. `luma_out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) unsafe fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); @@ -605,8 +628,8 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] let mut x = 0usize; while x + 16 <= width { // Two deinterleaves for 8 pixels each. - let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64(packed.as_ptr().add(x * 4)); - let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64(packed.as_ptr().add(x * 4 + 32)); + let (_a_lo, y_lo, _u_lo, _v_lo) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4)); + let (_a_hi, y_hi, _u_hi, _v_hi) = deinterleave_ayuv64::(packed.as_ptr().add(x * 4 + 32)); // Direct copy — Y samples are 16-bit native (no shift needed). _mm_storeu_si128(luma_out.as_mut_ptr().add(x).cast(), y_lo); @@ -617,7 +640,7 @@ pub(crate) unsafe fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16] // Scalar tail. 
if x < width { - scalar::ayuv64_to_luma_u16_row( + scalar::ayuv64_to_luma_u16_row::( &packed[x * 4..width * 4], &mut luma_out[x..width], width - x, diff --git a/src/row/arch/x86_sse41/tests/ayuv64.rs b/src/row/arch/x86_sse41/tests/ayuv64.rs index 1fad101f..50440a01 100644 --- a/src/row/arch/x86_sse41/tests/ayuv64.rs +++ b/src/row/arch/x86_sse41/tests/ayuv64.rs @@ -22,9 +22,11 @@ fn check_rgb( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::ayuv64_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::ayuv64_to_rgb_or_rgba_row::( + &p, &mut s, width, matrix, full_range, + ); unsafe { - ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -43,11 +45,13 @@ fn check_rgb_u16( let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( + scalar::ayuv64_to_rgb_u16_or_rgba_u16_row::( &p, &mut s, width, matrix, full_range, ); unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &p, &mut k, width, matrix, full_range, + ); } assert_eq!( s, @@ -61,9 +65,9 @@ fn check_luma(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::ayuv64_to_luma_row(&p, &mut s, width); + scalar::ayuv64_to_luma_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_row(&p, &mut k, width); + ayuv64_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 ayuv64→luma diverges (width={width})"); } @@ -72,9 +76,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_ayuv64(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::ayuv64_to_luma_u16_row(&p, &mut s, width); + scalar::ayuv64_to_luma_u16_row::(&p, &mut s, width); unsafe { - ayuv64_to_luma_u16_row(&p, &mut k, width); + ayuv64_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 ayuv64→luma u16 diverges (width={width})"); } @@ -167,7 +171,7 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { // --- luma_u16 path: Y values should be direct (no conversion). --- let mut luma_out = std::vec![0u16; W]; unsafe { - ayuv64_to_luma_u16_row(&packed, &mut luma_out, W); + ayuv64_to_luma_u16_row::(&packed, &mut luma_out, W); } let expected_luma: std::vec::Vec = (1..=16).map(|n| n as u16).collect(); assert_eq!( @@ -180,7 +184,7 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { // a well-defined Y output. Matrix choice does not affect neutral chroma. let mut rgba_out = std::vec![0u16; W * 4]; unsafe { - ayuv64_to_rgb_u16_or_rgba_u16_row::( + ayuv64_to_rgb_u16_or_rgba_u16_row::( &packed, &mut rgba_out, W, @@ -196,3 +200,160 @@ fn sse41_ayuv64_lane_order_per_pixel_y_and_a() { "rgba_u16: A lane order incorrect — expected A[n]=2n+1, got {alpha_out:?}" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x8::` for AYUV64. Covers the source-α +/// path explicitly via `(ALPHA=true, ALPHA_SRC=true)`. 
+#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_ayuv64_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_ayuv64(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + { + let mut out_le = std::vec![0u8; w * 3]; + let mut out_be = std::vec![0u8; w * 3]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgb, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w * 4]; + let mut out_be = std::vec![0u8; w * 4]; + unsafe { + ayuv64_to_rgb_or_rgba_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + false, + ); + ayuv64_to_rgb_or_rgba_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + false, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgba+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 3]; + let mut out_be = std::vec![0u16; w * 3]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgb u16, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w * 4]; + let mut out_be = std::vec![0u16; w * 4]; + unsafe { + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + ayuv64_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (rgba u16+srcα, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + ayuv64_to_luma_row::(&le, &mut out_le, w); + ayuv64_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + ayuv64_to_luma_u16_row::(&le, &mut out_le, w); + ayuv64_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 ayuv64 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/tests/v410.rs b/src/row/arch/x86_sse41/tests/v410.rs index ee908301..07b9d911 100644 --- a/src/row/arch/x86_sse41/tests/v410.rs +++ b/src/row/arch/x86_sse41/tests/v410.rs @@ -27,9 +27,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = 
std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -44,9 +44,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); + scalar::v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -60,9 +60,9 @@ fn check_luma(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::v410_to_luma_row(&p, &mut s, width); + scalar::v410_to_luma_row::(&p, &mut s, width); unsafe { - v410_to_luma_row(&p, &mut k, width); + v410_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v410→luma diverges (width={width})"); } @@ -71,9 +71,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_v410(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::v410_to_luma_u16_row(&p, &mut s, width); + scalar::v410_to_luma_u16_row::(&p, &mut s, width); unsafe { - v410_to_luma_u16_row(&p, &mut k, width); + v410_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 v410→luma u16 diverges (width={width})"); } @@ -181,7 +181,7 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order (u16, no shift loss) let mut luma = std::vec![0u16; W]; unsafe { - v410_to_luma_u16_row(&packed, &mut luma, W); + v410_to_luma_u16_row::(&packed, &mut luma, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma, expected_luma, "sse4.1 v410 luma reorder bug"); @@ -190,9 +190,15 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u8; W * 3]; let mut scalar_rgb = std::vec![0u8; W * 3]; unsafe { - v410_to_rgb_or_rgba_row::(&packed, &mut simd_rgb, W, crate::ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::( + &packed, + &mut simd_rgb, + W, + crate::ColorMatrix::Bt709, + false, + ); } - scalar::v410_to_rgb_or_rgba_row::( + scalar::v410_to_rgb_or_rgba_row::( &packed, &mut scalar_rgb, W, @@ -204,3 +210,125 @@ fn sse41_v410_lane_order_per_pixel_y_and_u() { "sse4.1 v410 SIMD vs scalar diverges — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u32x4::`. Existing per-backend tests use +/// `BE=false` only; existing dispatcher BE-vs-LE tests use `use_simd=false`, +/// so the SIMD endian gate is otherwise untested. +/// +/// On an LE host: +/// - SIMD `<…BE=false>` on LE input → no-swap path. +/// - SIMD `<…BE=true>` on BE input → swap path. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_v410_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. 
The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). + for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_v410(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + v410_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + v410_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + v410_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + v410_to_luma_row::(&le, &mut out_le, w); + v410_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + v410_to_luma_u16_row::(&le, &mut out_le, w); + v410_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 v410 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/tests/xv36.rs b/src/row/arch/x86_sse41/tests/xv36.rs index c6d18c9a..b4f1dd0d 100644 --- a/src/row/arch/x86_sse41/tests/xv36.rs +++ b/src/row/arch/x86_sse41/tests/xv36.rs @@ -17,9 +17,9 @@ fn check_rgb(width: usize, matrix: ColorMatrix, full_range: b let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u8; width * bpp]; let mut k = std::vec![0u8; width * bpp]; - scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_or_rgba_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -34,9 +34,9 @@ fn check_rgb_u16(width: usize, matrix: ColorMatrix, full_rang let bpp = if ALPHA { 4 } else { 3 }; let mut s = std::vec![0u16; width * bpp]; let mut k = std::vec![0u16; width * bpp]; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, 
matrix, full_range); + scalar::xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut s, width, matrix, full_range); unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); + xv36_to_rgb_u16_or_rgba_u16_row::(&p, &mut k, width, matrix, full_range); } assert_eq!( s, @@ -50,9 +50,9 @@ fn check_luma(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u8; width]; let mut k = std::vec![0u8; width]; - scalar::xv36_to_luma_row(&p, &mut s, width); + scalar::xv36_to_luma_row::(&p, &mut s, width); unsafe { - xv36_to_luma_row(&p, &mut k, width); + xv36_to_luma_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 xv36→luma diverges (width={width})"); } @@ -61,9 +61,9 @@ fn check_luma_u16(width: usize) { let p = pseudo_random_xv36(width, 0xC001); let mut s = std::vec![0u16; width]; let mut k = std::vec![0u16; width]; - scalar::xv36_to_luma_u16_row(&p, &mut s, width); + scalar::xv36_to_luma_u16_row::(&p, &mut s, width); unsafe { - xv36_to_luma_u16_row(&p, &mut k, width); + xv36_to_luma_u16_row::(&p, &mut k, width); } assert_eq!(s, k, "SSE4.1 xv36→luma u16 diverges (width={width})"); } @@ -176,7 +176,7 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { // Part 1: Luma natural-order at u16 (drops the 4-bit padding to recover n+1) let mut luma_u16 = std::vec![0u16; W]; unsafe { - xv36_to_luma_u16_row(&packed, &mut luma_u16, W); + xv36_to_luma_u16_row::(&packed, &mut luma_u16, W); } let expected_luma: std::vec::Vec = (1..=W as u16).collect(); assert_eq!(luma_u16, expected_luma, "sse4.1 xv36 luma_u16 reorder bug"); @@ -185,9 +185,15 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { let mut simd_rgb = std::vec![0u16; W * 3]; let mut scalar_rgb = std::vec![0u16; W * 3]; unsafe { - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut simd_rgb, W, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::( + &packed, + &mut simd_rgb, + W, + ColorMatrix::Bt709, + false, + ); } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( &packed, &mut scalar_rgb, W, @@ -199,3 +205,119 @@ fn sse41_xv36_lane_order_per_pixel_y_and_u() { "sse4.1 xv36 SIMD vs scalar diverges (u16 RGB) — lane-order bug" ); } + +/// SIMD-level BE-vs-LE parity test — exercises the host-aware endian gate +/// in `endian::load_endian_u16x8::`. See sibling v410 test for rationale. +#[test] +#[cfg_attr( + miri, + ignore = "SIMD-dispatched row kernels use intrinsics unsupported by Miri" +)] +fn sse41_xv36_be_le_simd_parity() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent. The earlier `swap_bytes` pattern only + // validated this on LE hosts (on BE hosts both buffers degenerate to + // equal-but-wrong values and the test passed vacuously). 
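+    // Worked example (hypothetical sample 0x1234): `to_le_bytes` yields
+    // [0x34, 0x12] and `to_be_bytes` yields [0x12, 0x34]; re-packing each
+    // pair with `from_ne_bytes` gives whatever value the host reads from
+    // those bytes, so `le` always carries LE wire bytes and `be` always
+    // carries BE wire bytes, on both LE and BE hosts.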
+ for w in [7usize, 8, 17, 33] { + let intended = pseudo_random_xv36(w, 0xBEEF); + let le_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: std::vec::Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le: std::vec::Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be: std::vec::Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u8; w * bpp]; + let mut out_be = std::vec![0u8; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } else { + xv36_to_rgb_or_rgba_row::(&le, &mut out_le, w, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be, &mut out_be, w, ColorMatrix::Bt709, false); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (alpha={alpha}, w={w})" + ); + } + + for (alpha, bpp) in [(false, 3usize), (true, 4)] { + let mut out_le = std::vec![0u16; w * bpp]; + let mut out_be = std::vec![0u16; w * bpp]; + unsafe { + if alpha { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } else { + xv36_to_rgb_u16_or_rgba_u16_row::( + &le, + &mut out_le, + w, + ColorMatrix::Bt709, + true, + ); + xv36_to_rgb_u16_or_rgba_u16_row::( + &be, + &mut out_be, + w, + ColorMatrix::Bt709, + true, + ); + } + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (u16, alpha={alpha}, w={w})" + ); + } + + { + let mut out_le = std::vec![0u8; w]; + let mut out_be = std::vec![0u8; w]; + unsafe { + xv36_to_luma_row::(&le, &mut out_le, w); + xv36_to_luma_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (luma u8, w={w})" + ); + } + + { + let mut out_le = std::vec![0u16; w]; + let mut out_be = std::vec![0u16; w]; + unsafe { + xv36_to_luma_u16_row::(&le, &mut out_le, w); + xv36_to_luma_u16_row::(&be, &mut out_be, w); + } + assert_eq!( + out_le, out_be, + "sse4.1 xv36 BE-vs-LE SIMD parity failed (luma u16, w={w})" + ); + } + } +} diff --git a/src/row/arch/x86_sse41/v410.rs b/src/row/arch/x86_sse41/v410.rs index 0c38d996..adb0ce98 100644 --- a/src/row/arch/x86_sse41/v410.rs +++ b/src/row/arch/x86_sse41/v410.rs @@ -24,7 +24,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- u8 RGB / RGBA output (8 px/iter) ----------------------------------- @@ -40,7 +40,7 @@ use crate::{ColorMatrix, row::scalar}; /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_rgb_or_rgba_row( +pub(crate) unsafe fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -73,8 +73,8 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Load 8 V410 words = 8 pixels (32 bytes = 2 × __m128i). 
- let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Extract U (bits 9:0), Y (bits 19:10), V (bits 29:20) for each // 4-pixel batch as i32x4. Values ≤ 1023 — safe for i16. @@ -154,7 +154,13 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::v410_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -173,7 +179,7 @@ pub(crate) unsafe fn v410_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -208,8 +214,8 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); let u_lo_i32 = _mm_and_si128(words_lo, mask); let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -283,7 +289,7 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x..width]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::v410_to_rgb_u16_or_rgba_u16_row::( + scalar::v410_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -307,7 +313,11 @@ pub(crate) unsafe fn v410_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) unsafe fn v410_to_luma_row( + packed: &[u32], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -316,8 +326,8 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Y = (word >> 10) & 0x3FF for each lane. let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -340,7 +350,7 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::v410_to_luma_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_row::(&packed[x..width], &mut out[x..width], width - x); } } } @@ -359,7 +369,11 @@ pub(crate) unsafe fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) unsafe fn v410_to_luma_u16_row( + packed: &[u32], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); @@ -368,8 +382,8 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { - let words_lo = _mm_loadu_si128(packed.as_ptr().add(x).cast()); - let words_hi = _mm_loadu_si128(packed.as_ptr().add(x + 4).cast()); + let words_lo = endian::load_endian_u32x4::(packed.as_ptr().add(x) as *const u8); + let words_hi = endian::load_endian_u32x4::(packed.as_ptr().add(x + 4) as *const u8); // Y = (word >> 10) & 0x3FF for each lane. let y_lo_i32 = _mm_and_si128(_mm_srli_epi32::<10>(words_lo), mask); @@ -386,7 +400,7 @@ pub(crate) unsafe fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width // Scalar tail. if x < width { - scalar::v410_to_luma_u16_row(&packed[x..width], &mut out[x..width], width - x); + scalar::v410_to_luma_u16_row::(&packed[x..width], &mut out[x..width], width - x); } } } diff --git a/src/row/arch/x86_sse41/xv36.rs b/src/row/arch/x86_sse41/xv36.rs index 7beaf02c..cda808ab 100644 --- a/src/row/arch/x86_sse41/xv36.rs +++ b/src/row/arch/x86_sse41/xv36.rs @@ -53,7 +53,7 @@ use core::arch::x86_64::*; -use super::*; +use super::{endian, *}; use crate::{ColorMatrix, row::scalar}; // ---- Deinterleave helper ------------------------------------------------ @@ -63,6 +63,9 @@ use crate::{ColorMatrix, row::scalar}; /// `u16` samples **after** the 4-bit right-shift to drop padding LSBs. /// The A channel is computed but returned separately (caller discards it). /// +/// When `BE = true`, each 128-bit load is byte-swapped within every 2-byte +/// lane via `endian::load_endian_u16x8::`. +/// /// See module-level doc for the 3-level unpack cascade. /// /// # Safety @@ -71,13 +74,14 @@ use crate::{ColorMatrix, row::scalar}; /// Caller's `target_feature` must include SSE4.1. #[inline] #[target_feature(enable = "sse4.1")] -unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { +unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { unsafe { // Load 4 × __m128i (8 pixels × 4 channels × u16 = 64 bytes). - let raw0 = _mm_loadu_si128(ptr.cast()); // U0,Y0,V0,A0,U1,Y1,V1,A1 - let raw1 = _mm_loadu_si128(ptr.add(8).cast()); // U2,Y2,V2,A2,U3,Y3,V3,A3 - let raw2 = _mm_loadu_si128(ptr.add(16).cast()); // U4,Y4,V4,A4,U5,Y5,V5,A5 - let raw3 = _mm_loadu_si128(ptr.add(24).cast()); // U6,Y6,V6,A6,U7,Y7,V7,A7 + // BE=true: byte-swap within each u16 lane to correct wire endianness. + let raw0 = endian::load_endian_u16x8::(ptr as *const u8); // U0,Y0,V0,A0,U1,Y1,V1,A1 + let raw1 = endian::load_endian_u16x8::(ptr.add(8) as *const u8); // U2,Y2,V2,A2,U3,Y3,V3,A3 + let raw2 = endian::load_endian_u16x8::(ptr.add(16) as *const u8); // U4,Y4,V4,A4,U5,Y5,V5,A5 + let raw3 = endian::load_endian_u16x8::(ptr.add(24) as *const u8); // U6,Y6,V6,A6,U7,Y7,V7,A7 // Level 1 unpack (pairs 0-1, pairs 2-3). let s1_lo = _mm_unpacklo_epi16(raw0, raw1); // U0,U2,Y0,Y2,V0,V2,A0,A2 @@ -119,7 +123,7 @@ unsafe fn deinterleave_xv36(ptr: *const u16) -> (__m128i, __m128i, __m128i) { /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( +pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -151,7 +155,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let mut x = 0usize; while x + 8 <= width { // Deinterleave 8 XV36 quadruples → U, Y, V as i16x8 in [0, 4095]. - let (u_u16, y_u16, v_u16) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // Reinterpret as signed i16 (values ≤ 4095 < 32767, safe). let u_i16 = u_u16; // u16 values fit in i16 range @@ -221,7 +225,13 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_or_rgba_row::(tail_packed, tail_out, tail_w, matrix, full_range); + scalar::xv36_to_rgb_or_rgba_row::( + tail_packed, + tail_out, + tail_w, + matrix, + full_range, + ); } } } @@ -240,7 +250,7 @@ pub(crate) unsafe fn xv36_to_rgb_or_rgba_row( /// 3. `out.len() >= width * (if ALPHA { 4 } else { 3 })` (u16 elements). #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -276,7 +286,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let mut x = 0usize; while x + 8 <= width { - let (u_u16, y_u16, v_u16) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (u_u16, y_u16, v_u16) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); let u_i16 = u_u16; let y_i16 = y_u16; @@ -341,7 +351,7 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( let tail_packed = &packed[x * 4..width * 4]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( tail_packed, tail_out, tail_w, @@ -367,7 +377,11 @@ pub(crate) unsafe fn xv36_to_rgb_u16_or_rgba_u16_row( /// 3. `out.len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) unsafe fn xv36_to_luma_row( + packed: &[u16], + out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -375,7 +389,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi let mut x = 0usize; while x + 8 <= width { // Deinterleave to get Y channel, then shift >> 8 for u8 luma. - let (_u_vec, y_vec, _v_vec) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // y_vec already has >> 4 applied (values in [0, 4095]). // Scalar does `packed[x*4+1] >> 8` — that's (MSB-aligned >> 4) >> 4 @@ -395,7 +409,7 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi // Scalar tail. if x < width { - scalar::xv36_to_luma_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } @@ -415,7 +429,11 @@ pub(crate) unsafe fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usi /// 3. `out.len() >= width`. 
#[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) unsafe fn xv36_to_luma_u16_row( + packed: &[u16], + out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); @@ -423,7 +441,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width let mut x = 0usize; while x + 8 <= width { // Deinterleave — y_vec already has >> 4 applied (= 12-bit value). - let (_u_vec, y_vec, _v_vec) = deinterleave_xv36(packed.as_ptr().add(x * 4)); + let (_u_vec, y_vec, _v_vec) = deinterleave_xv36::(packed.as_ptr().add(x * 4)); // Direct store of 8 × u16 (12-bit values in low bits). _mm_storeu_si128(out.as_mut_ptr().add(x).cast(), y_vec); @@ -433,7 +451,7 @@ pub(crate) unsafe fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width // Scalar tail. if x < width { - scalar::xv36_to_luma_u16_row(&packed[x * 4..width * 4], &mut out[x..width], width - x); + scalar::xv36_to_luma_u16_row::(&packed[x * 4..width * 4], &mut out[x..width], width - x); } } } diff --git a/src/row/dispatch/ayuv64.rs b/src/row/dispatch/ayuv64.rs index 4c1442ac..0d757ee8 100644 --- a/src/row/dispatch/ayuv64.rs +++ b/src/row/dispatch/ayuv64.rs @@ -14,6 +14,9 @@ //! //! Source α is real (depth-converted u16 → u8 via `>> 8` for u8 RGBA; //! written direct as u16 for u16 RGBA). +//! +//! `be_input = true` selects the big-endian wire variant: each u16 +//! element is byte-swapped before unpacking, matching BE AYUV64 streams. #[cfg(any( target_arch = "aarch64", @@ -46,7 +49,8 @@ fn ayuv64_packed_elems(width: usize) -> usize { /// Converts one row of AYUV64 to packed RGB (u8). Source α is discarded. /// See [`scalar::ayuv64_to_rgb_or_rgba_row`] for pixel layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgb_row( packed: &[u16], @@ -55,6 +59,7 @@ pub fn ayuv64_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -70,31 +75,51 @@ pub fn ayuv64_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -102,12 +127,16 @@ pub fn ayuv64_to_rgb_row( } } - scalar::ayuv64_to_rgb_row(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgb_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed RGBA (u8). The source A u16 at slot 0 /// of each pixel quadruple is depth-converted to u8 via `>> 8`. `use_simd = -/// false` forces scalar. +/// false` forces scalar. `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgba_row( packed: &[u16], @@ -116,6 +145,7 @@ pub fn ayuv64_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -131,31 +161,51 @@ pub fn ayuv64_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -163,11 +213,16 @@ pub fn ayuv64_to_rgba_row( } } - scalar::ayuv64_to_rgba_row(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed `u16` RGB at native 16-bit /// depth. Source α is discarded. `use_simd = false` forces scalar. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgb_u16_row( packed: &[u16], @@ -176,6 +231,7 @@ pub fn ayuv64_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -191,31 +247,51 @@ pub fn ayuv64_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -223,12 +299,17 @@ pub fn ayuv64_to_rgb_u16_row( } } - scalar::ayuv64_to_rgb_u16_row(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgb_u16_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of AYUV64 to packed `u16` RGBA at native 16-bit /// depth. The source A u16 at slot 0 of each pixel quadruple is written /// direct (no conversion). `use_simd = false` forces scalar. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn ayuv64_to_rgba_u16_row( packed: &[u16], @@ -237,6 +318,7 @@ pub fn ayuv64_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= ayuv64_packed_elems(width), @@ -252,31 +334,51 @@ pub fn ayuv64_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -284,14 +386,24 @@ pub fn ayuv64_to_rgba_u16_row( } } - scalar::ayuv64_to_rgba_u16_row(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::ayuv64_to_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + } } /// Extracts one row of 8-bit luma from a packed AYUV64 buffer. Y is at slot 1 /// of each pixel quadruple; extracted via `>> 8` (high byte). `use_simd = -/// false` forces scalar. +/// false` forces scalar. `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= ayuv64_packed_elems(width), "packed row too short" @@ -303,31 +415,51 @@ pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -335,14 +467,25 @@ pub fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use } } - scalar::ayuv64_to_luma_row(packed, luma_out, width); + if be_input { + scalar::ayuv64_to_luma_row::(packed, luma_out, width); + } else { + scalar::ayuv64_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed AYUV64 buffer. 
/// Y is at slot 1 of each pixel quadruple; written direct (no shift — 16-bit -/// native). `use_simd = false` forces scalar. +/// native). `use_simd = false` forces scalar. `be_input = true` selects the +/// big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= ayuv64_packed_elems(width), "packed row too short" @@ -354,31 +497,51 @@ pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::ayuv64_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -386,7 +549,11 @@ pub fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize } } - scalar::ayuv64_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::ayuv64_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::ayuv64_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -404,6 +571,16 @@ mod tests { [a, y, u, v] } + /// Pack one AYUV64 pixel in big-endian wire format. + fn pack_ayuv64_be(a: u16, y: u16, u: u16, v: u16) -> [u16; 4] { + [ + a.swap_bytes(), + y.swap_bytes(), + u.swap_bytes(), + v.swap_bytes(), + ] + } + /// Build a `Vec` AYUV64 row of `width` pixels with neutral /// chroma (U=V=32768) and the given Y / alpha values. Any positive /// width is valid (4:4:4, no chroma subsampling). @@ -412,6 +589,12 @@ mod tests { (0..width).flat_map(|_| quad).collect() } + /// Build a `Vec` AYUV64 row in big-endian wire format. 
+ fn solid_ayuv64_be(width: usize, y: u16, a: u16) -> std::vec::Vec { + let quad = pack_ayuv64_be(a, y, 32768, 32768); + (0..width).flat_map(|_| quad).collect() + } + // ---- panic guards ------------------------------------------------------- #[test] @@ -420,7 +603,7 @@ mod tests { // packed buffer has only 2×4=8 u16 elements for width=4 (needs 4×4=16). let packed = [0u16; 8]; let mut rgb = [0u8; 4 * 3]; - ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -428,7 +611,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgb_output() { let packed = [0u16; 4 * 4]; let mut rgb = [0u8; 2]; - ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -436,7 +619,15 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgba_output() { let packed = [0u16; 4 * 4]; let mut rgba = [0u8; 2]; - ayuv64_to_rgba_row(&packed, &mut rgba, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgba_row( + &packed, + &mut rgba, + 4, + ColorMatrix::Bt709, + true, + false, + false, + ); } #[test] @@ -444,7 +635,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgb_u16_output() { let packed = [0u16; 4 * 4]; let mut rgb = [0u16; 2]; - ayuv64_to_rgb_u16_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgb_u16_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -452,7 +643,15 @@ mod tests { fn ayuv64_dispatcher_rejects_short_rgba_u16_output() { let packed = [0u16; 4 * 4]; let mut rgba = [0u16; 2]; - ayuv64_to_rgba_u16_row(&packed, &mut rgba, 4, ColorMatrix::Bt709, true, false); + ayuv64_to_rgba_u16_row( + &packed, + &mut rgba, + 4, + ColorMatrix::Bt709, + true, + false, + false, + ); } #[test] @@ -460,7 +659,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_luma_output() { let packed = [0u16; 4 * 4]; let mut luma = [0u8; 2]; - ayuv64_to_luma_row(&packed, &mut luma, 4, false); + ayuv64_to_luma_row(&packed, &mut luma, 4, false, false); } #[test] @@ -468,7 +667,7 @@ mod tests { fn ayuv64_dispatcher_rejects_short_luma_u16_output() { let packed = [0u16; 4 * 4]; let mut luma = [0u16; 2]; - ayuv64_to_luma_u16_row(&packed, &mut luma, 4, false); + ayuv64_to_luma_u16_row(&packed, &mut luma, 4, false, false); } // ---- functional smoke --------------------------------------------------- @@ -482,7 +681,7 @@ mod tests { // u8 RGB — limited-range white → near 255 on every channel let mut rgb = [0u8; 8 * 3]; - ayuv64_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, false, false, false); for px in rgb.chunks(3) { assert!( px[0].abs_diff(255) <= 2, @@ -495,7 +694,7 @@ mod tests { // u8 RGBA — source α 0xABCD >> 8 = 0xAB in output α channel let mut rgba = [0u8; 8 * 4]; - ayuv64_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, false, false, false); for px in rgba.chunks(4) { assert!( px[0].abs_diff(255) <= 2, @@ -510,7 +709,15 @@ mod tests { // u16 RGB — near-white (65535 or close) let mut rgb_u16 = [0u16; 8 * 3]; - ayuv64_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!( px[0].abs_diff(0xFFFF) <= 256, @@ -523,7 +730,15 @@ mod tests { // u16 RGBA 
— source α 0xABCD must appear direct in output α channel let mut rgba_u16 = [0u16; 8 * 4]; - ayuv64_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, false, false); + ayuv64_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!( px[3], 0xABCDu16, @@ -533,19 +748,90 @@ mod tests { // u8 luma — Y=60160; >> 8 = 234 (0xEA) let mut luma = [0u8; 8]; - ayuv64_to_luma_row(&buf, &mut luma, 8, false); + ayuv64_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (60160u16 >> 8) as u8, "luma u8 must be Y >> 8"); } // u16 luma — Y=60160 written direct let mut luma_u16 = [0u16; 8]; - ayuv64_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + ayuv64_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 60160u16, "luma u16 must be Y direct"); } } + #[test] + fn ayuv64_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. + // Use a distinctive Y/alpha so both output channels are exercised. + let le_buf = solid_ayuv64(8, 60160, 0xABCD); + let be_buf = solid_ayuv64_be(8, 60160, 0xABCD); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + ayuv64_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); + ayuv64_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + false, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + ayuv64_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + ayuv64_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + + // u16 RGBA — alpha pass-through must survive BE swap + let mut rgba_u16_le = [0u16; 8 * 4]; + let mut rgba_u16_be = [0u16; 8 * 4]; + ayuv64_to_rgba_u16_row( + &le_buf, + &mut rgba_u16_le, + 8, + ColorMatrix::Bt709, + false, + false, + false, + ); + ayuv64_to_rgba_u16_row( + &be_buf, + &mut rgba_u16_be, + 8, + ColorMatrix::Bt709, + false, + false, + true, + ); + assert_eq!( + rgba_u16_le, rgba_u16_be, + "LE and BE must produce identical u16 RGBA output" + ); + } + // ---- 32-bit width × 4 overflow guard ------------------------------------ // // AYUV64 packed rows consume `4 * width` u16 elements. Without the @@ -572,6 +858,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/dispatch/v410.rs b/src/row/dispatch/v410.rs index d51e50cc..436e695b 100644 --- a/src/row/dispatch/v410.rs +++ b/src/row/dispatch/v410.rs @@ -11,6 +11,10 @@ //! complete pixel as 10-bit U / Y / V packed into bits [9:0] / [19:10] //! / [29:20] with 2-bit padding at the top. Buffer length is `width` //! u32 elements — no even-width restriction, no width×2 scaling. +//! +//! `be_input = true` selects the big-endian wire variant: each u32 word +//! is byte-swapped before unpacking, matching QuickTime-style BE V410 +//! streams. #[cfg(any( target_arch = "aarch64", @@ -31,7 +35,8 @@ use crate::{ /// Converts one row of V410 to packed RGB (u8). See /// [`scalar::v410_to_rgb_or_rgba_row`] for word layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. 
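+/// For example, decoding a big-endian wire row with SIMD enabled
+/// (illustrative values): `v410_to_rgb_row(&packed, &mut rgb, width,
+/// ColorMatrix::Bt709, false, true, true)` — limited range,
+/// `use_simd = true`, `be_input = true`.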
#[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgb_row( packed: &[u32], @@ -40,6 +45,7 @@ pub fn v410_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -52,31 +58,51 @@ pub fn v410_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -84,10 +110,15 @@ pub fn v410_to_rgb_row( } } - scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::v410_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of V410 to packed RGBA (u8) with `α = 0xFF`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgba_row( packed: &[u32], @@ -96,6 +127,7 @@ pub fn v410_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -108,31 +140,51 @@ pub fn v410_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. 
- unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -140,11 +192,16 @@ pub fn v410_to_rgba_row( } } - scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::v410_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of V410 to packed `u16` RGB at native 10-bit -/// depth (low-bit-packed, `[0, 1023]`). +/// depth (low-bit-packed, `[0, 1023]`). `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgb_u16_row( packed: &[u32], @@ -153,6 +210,7 @@ pub fn v410_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -165,31 +223,51 @@ pub fn v410_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. 
- unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -197,11 +275,20 @@ pub fn v410_to_rgb_u16_row( } } - scalar::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } else { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } } /// Converts one row of V410 to packed `u16` RGBA at native 10-bit -/// depth with `α = 1023` (10-bit opaque maximum). +/// depth with `α = 1023` (10-bit opaque maximum). `be_input = true` +/// selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn v410_to_rgba_u16_row( packed: &[u32], @@ -210,6 +297,7 @@ pub fn v410_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!(packed.len() >= width, "packed row too short"); assert!( @@ -222,31 +310,51 @@ pub fn v410_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -254,13 +362,28 @@ pub fn v410_to_rgba_u16_row( } } - scalar::v410_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } else { + scalar::v410_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } } /// Extracts one row of 8-bit luma from a packed V410 buffer. /// Y values are downshifted from 10-bit to 8-bit via `>> 2`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn v410_to_luma_row( + packed: &[u32], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!(packed.len() >= width, "packed row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); @@ -269,31 +392,51 @@ pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::v410_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::v410_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
- unsafe { arch::x86_sse41::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::v410_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::v410_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -301,14 +444,25 @@ pub fn v410_to_luma_row(packed: &[u32], luma_out: &mut [u8], width: usize, use_s } } - scalar::v410_to_luma_row(packed, luma_out, width); + if be_input { + scalar::v410_to_luma_row::(packed, luma_out, width); + } else { + scalar::v410_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed V410 /// buffer (low-bit-packed: each `u16` carries the 10-bit Y value in -/// its low 10 bits). +/// its low 10 bits). `be_input = true` selects the big-endian wire +/// variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn v410_to_luma_u16_row( + packed: &[u32], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!(packed.len() >= width, "packed row too short"); assert!(luma_out.len() >= width, "luma_out row too short"); @@ -317,31 +471,51 @@ pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
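                // Caller-side sketch with assumed values (not from the test suite):
                // extracting a native-depth luma row from a big-endian V410 buffer.
                // The signature mirrors the u8 variant above, plus the new trailing
                // `be_input` flag.
                //
                //     let packed_be = [0u32; 8];   // width u32 words in BE wire order
                //     let mut luma = [0u16; 8];    // width low-bit-packed 10-bit Y values
                //     // args: packed, luma_out, width, use_simd, be_input
                //     v410_to_luma_u16_row(&packed_be, &mut luma, 8, false, true);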
- unsafe { arch::wasm_simd128::v410_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::v410_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::v410_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -349,7 +523,11 @@ pub fn v410_to_luma_u16_row(packed: &[u32], luma_out: &mut [u16], width: usize, } } - scalar::v410_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::v410_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::v410_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -367,19 +545,29 @@ mod tests { (v << 20) | (y << 10) | u } + /// Pack one V410 word in big-endian wire format. + fn pack_v410_be(u: u32, y: u32, v: u32) -> u32 { + pack_v410(u, y, v).swap_bytes() + } + /// Build a `Vec` V410 row of `width` pixels with `(U, Y, V)` /// repeated. Any positive width is valid (4:4:4, no chroma subsampling). fn solid_v410(width: usize, u: u32, y: u32, v: u32) -> std::vec::Vec { (0..width).map(|_| pack_v410(u, y, v)).collect() } + /// Build a `Vec` V410 row in big-endian wire format. + fn solid_v410_be(width: usize, u: u32, y: u32, v: u32) -> std::vec::Vec { + (0..width).map(|_| pack_v410_be(u, y, v)).collect() + } + #[test] #[should_panic(expected = "packed row too short")] fn v410_dispatcher_rejects_short_packed() { // packed buffer has only 2 elements for width=4 (needs 4). let packed = [0u32; 2]; let mut rgb = [0u8; 4 * 3]; - v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -388,7 +576,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). let packed = [0u32; 4]; let mut rgb = [0u8; 2]; - v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -400,7 +588,7 @@ mod tests { // u8 RGB let mut rgb = [0u8; 8 * 3]; - v410_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + v410_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[0], px[1]); @@ -409,7 +597,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - v410_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + v410_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 1); assert_eq!(px[3], 0xFF); @@ -417,7 +605,15 @@ mod tests { // u16 RGB at native 10-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - v410_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + v410_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(512) <= 2); assert_eq!(px[0], px[1]); @@ -426,23 +622,75 @@ mod tests { // u16 RGBA — alpha = 1023 (10-bit opaque maximum). let mut rgba_u16 = [0u16; 8 * 4]; - v410_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + v410_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 1023); } // u8 luma — Y=512 → 128 after `>> 2`. 
let mut luma = [0u8; 8]; - v410_to_luma_row(&buf, &mut luma, 8, false); + v410_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, (512u32 >> 2) as u8); } // u16 luma — low-packed 10-bit Y value. let mut luma_u16 = [0u16; 8]; - v410_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + v410_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 512); } } + + #[test] + fn v410_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. + let le_buf = solid_v410(8, 512, 512, 512); + let be_buf = solid_v410_be(8, 512, 512, 512); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + v410_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); + v410_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + true, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + v410_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + v410_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + } } diff --git a/src/row/dispatch/xv36.rs b/src/row/dispatch/xv36.rs index ec40d054..81d83b03 100644 --- a/src/row/dispatch/xv36.rs +++ b/src/row/dispatch/xv36.rs @@ -11,6 +11,9 @@ //! quadruple `[U, Y, V, A]` MSB-aligned at 12-bit (low 4 bits zero //! per sample). Buffer length is `width × 4` u16 elements — no //! even-width restriction. +//! +//! `be_input = true` selects the big-endian wire variant: each u16 +//! element is byte-swapped before unpacking, matching BE XV36 streams. #[cfg(any( target_arch = "aarch64", @@ -43,7 +46,8 @@ fn xv36_packed_elems(width: usize) -> usize { /// Converts one row of XV36 to packed RGB (u8). See /// [`scalar::xv36_to_rgb_or_rgba_row`] for pixel layout / numerical -/// contract. `use_simd = false` forces scalar. +/// contract. `use_simd = false` forces scalar. `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgb_row( packed: &[u16], @@ -52,6 +56,7 @@ pub fn xv36_to_rgb_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -67,31 +72,51 @@ pub fn xv36_to_rgb_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified at runtime. - unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. 
- unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -99,10 +124,15 @@ pub fn xv36_to_rgb_row( } } - scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } else { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + } } /// Converts one row of XV36 to packed RGBA (u8) with `α = 0xFF`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgba_row( packed: &[u16], @@ -111,6 +141,7 @@ pub fn xv36_to_rgba_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -126,31 +157,51 @@ pub fn xv36_to_rgba_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. 
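                // Caller-side sketch with assumed values (not from the test suite):
                // decoding one 4-pixel row of big-endian XV36 to RGBA through this
                // dispatcher. The only change for BE streams is the trailing
                // `be_input = true`; buffer sizing (width * 4 u16 in, width * 4 u8 out)
                // is unchanged.
                //
                //     let packed_be = [0u16; 4 * 4];   // width * 4 BE-encoded u16 elements
                //     let mut rgba = [0u8; 4 * 4];     // width * 4 output bytes
                //     // args: packed, rgba_out, width, matrix, full_range, use_simd, be_input
                //     xv36_to_rgba_row(&packed_be, &mut rgba, 4, ColorMatrix::Bt709, true, true, true);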
- unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -158,11 +209,16 @@ pub fn xv36_to_rgba_row( } } - scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } else { + scalar::xv36_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + } } /// Converts one row of XV36 to packed `u16` RGB at native 12-bit -/// depth (low-bit-packed, `[0, 4095]`). +/// depth (low-bit-packed, `[0, 4095]`). `be_input = true` selects +/// the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgb_u16_row( packed: &[u16], @@ -171,6 +227,7 @@ pub fn xv36_to_rgb_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -186,31 +243,51 @@ pub fn xv36_to_rgb_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } + } return; } }, @@ -218,11 +295,20 @@ pub fn xv36_to_rgb_u16_row( } } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } else { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgb_out, width, matrix, full_range, + ); + } } /// Converts one row of XV36 to packed `u16` RGBA at native 12-bit -/// depth with `α = 4095` (12-bit opaque maximum). +/// depth with `α = 4095` (12-bit opaque maximum). `be_input = true` +/// selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] pub fn xv36_to_rgba_u16_row( packed: &[u16], @@ -231,6 +317,7 @@ pub fn xv36_to_rgba_u16_row( matrix: ColorMatrix, full_range: bool, use_simd: bool, + be_input: bool, ) { assert!( packed.len() >= xv36_packed_elems(width), @@ -246,31 +333,51 @@ pub fn xv36_to_rgba_u16_row( target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::neon::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx512::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_avx2::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::x86_sse41::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. 
- unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } else { + unsafe { arch::wasm_simd128::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } + } return; } }, @@ -278,13 +385,28 @@ pub fn xv36_to_rgba_u16_row( } } - scalar::xv36_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + if be_input { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } else { + scalar::xv36_to_rgb_u16_or_rgba_u16_row::( + packed, rgba_out, width, matrix, full_range, + ); + } } /// Extracts one row of 8-bit luma from a packed XV36 buffer. /// Y values are downshifted from 12-bit MSB-aligned to 8-bit via `>> 8`. +/// `be_input = true` selects the big-endian wire variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_simd: bool) { +pub fn xv36_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= xv36_packed_elems(width), "packed row too short" @@ -296,31 +418,51 @@ pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::xv36_to_luma_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::xv36_to_luma_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_luma_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_luma_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::xv36_to_luma_row::(packed, luma_out, width); } + } return; } }, @@ -328,14 +470,25 @@ pub fn xv36_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize, use_s } } - scalar::xv36_to_luma_row(packed, luma_out, width); + if be_input { + scalar::xv36_to_luma_row::(packed, luma_out, width); + } else { + scalar::xv36_to_luma_row::(packed, luma_out, width); + } } /// Extracts one row of native-depth `u16` luma from a packed XV36 /// buffer (low-bit-packed: each `u16` carries the 12-bit Y value in -/// its low 12 bits). +/// its low 12 bits). 
`be_input = true` selects the big-endian wire +/// variant. #[cfg_attr(not(tarpaulin), inline(always))] -pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, use_simd: bool) { +pub fn xv36_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, + use_simd: bool, + be_input: bool, +) { assert!( packed.len() >= xv36_packed_elems(width), "packed row too short" @@ -347,31 +500,51 @@ pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, target_arch = "aarch64" => { if neon_available() { // SAFETY: NEON verified. - unsafe { arch::neon::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::neon::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::neon::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "x86_64" => { if avx512_available() { // SAFETY: AVX-512BW verified. - unsafe { arch::x86_avx512::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx512::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx512::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } if avx2_available() { // SAFETY: AVX2 verified. - unsafe { arch::x86_avx2::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_avx2::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_avx2::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } if sse41_available() { // SAFETY: SSE4.1 verified. - unsafe { arch::x86_sse41::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::x86_sse41::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::x86_sse41::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, target_arch = "wasm32" => { if simd128_available() { // SAFETY: simd128 compile-time verified. - unsafe { arch::wasm_simd128::xv36_to_luma_u16_row(packed, luma_out, width); } + if be_input { + unsafe { arch::wasm_simd128::xv36_to_luma_u16_row::(packed, luma_out, width); } + } else { + unsafe { arch::wasm_simd128::xv36_to_luma_u16_row::(packed, luma_out, width); } + } return; } }, @@ -379,7 +552,11 @@ pub fn xv36_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize, } } - scalar::xv36_to_luma_u16_row(packed, luma_out, width); + if be_input { + scalar::xv36_to_luma_u16_row::(packed, luma_out, width); + } else { + scalar::xv36_to_luma_u16_row::(packed, luma_out, width); + } } #[cfg(all(test, feature = "std"))] @@ -398,6 +575,17 @@ mod tests { [u << 4, y << 4, v << 4, a << 4] } + /// Pack one XV36 pixel in big-endian wire format. + fn pack_xv36_be(u: u16, y: u16, v: u16, a: u16) -> [u16; 4] { + let le = pack_xv36(u, y, v, a); + [ + le[0].swap_bytes(), + le[1].swap_bytes(), + le[2].swap_bytes(), + le[3].swap_bytes(), + ] + } + /// Build a `Vec` XV36 row of `width` pixels with `(U, Y, V, A)` /// repeated. Any positive width is valid (4:4:4, no chroma subsampling). fn solid_xv36(width: usize, u: u16, y: u16, v: u16) -> std::vec::Vec { @@ -405,13 +593,19 @@ mod tests { (0..width).flat_map(|_| quad).collect() } + /// Build a `Vec` XV36 row in big-endian wire format. 
+ fn solid_xv36_be(width: usize, u: u16, y: u16, v: u16) -> std::vec::Vec { + let quad = pack_xv36_be(u, y, v, 0); + (0..width).flat_map(|_| quad).collect() + } + #[test] #[should_panic(expected = "packed row too short")] fn xv36_dispatcher_rejects_short_packed() { // packed buffer has only 2*4=8 u16 elements for width=4 (needs 4*4=16). let packed = [0u16; 8]; let mut rgb = [0u8; 4 * 3]; - xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -420,7 +614,7 @@ mod tests { // output buffer has only 2 bytes for width=4 (needs 12). let packed = [0u16; 4 * 4]; let mut rgb = [0u8; 2]; - xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&packed, &mut rgb, 4, ColorMatrix::Bt709, true, false, false); } #[test] @@ -433,7 +627,7 @@ mod tests { // u8 RGB — full-range gray 0x800/0xFFF * 255 ≈ 128 let mut rgb = [0u8; 8 * 3]; - xv36_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgb_row(&buf, &mut rgb, 8, ColorMatrix::Bt709, true, false, false); for px in rgb.chunks(3) { assert!(px[0].abs_diff(128) <= 2); assert_eq!(px[0], px[1]); @@ -442,7 +636,7 @@ mod tests { // u8 RGBA — alpha = 0xFF let mut rgba = [0u8; 8 * 4]; - xv36_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgba_row(&buf, &mut rgba, 8, ColorMatrix::Bt709, true, false, false); for px in rgba.chunks(4) { assert!(px[0].abs_diff(128) <= 2); assert_eq!(px[3], 0xFF); @@ -450,7 +644,15 @@ mod tests { // u16 RGB at native 12-bit depth. let mut rgb_u16 = [0u16; 8 * 3]; - xv36_to_rgb_u16_row(&buf, &mut rgb_u16, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgb_u16_row( + &buf, + &mut rgb_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgb_u16.chunks(3) { assert!(px[0].abs_diff(0x800) <= 4); assert_eq!(px[0], px[1]); @@ -459,26 +661,78 @@ mod tests { // u16 RGBA — alpha = 4095 (12-bit opaque maximum). let mut rgba_u16 = [0u16; 8 * 4]; - xv36_to_rgba_u16_row(&buf, &mut rgba_u16, 8, ColorMatrix::Bt709, true, false); + xv36_to_rgba_u16_row( + &buf, + &mut rgba_u16, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); for px in rgba_u16.chunks(4) { assert_eq!(px[3], 0x0FFF); } // u8 luma — Y=0x800 MSB-aligned → u16 value 0x8000; >> 8 = 128. let mut luma = [0u8; 8]; - xv36_to_luma_row(&buf, &mut luma, 8, false); + xv36_to_luma_row(&buf, &mut luma, 8, false, false); for &y in &luma { assert_eq!(y, 0x80u8); } // u16 luma — low-packed 12-bit Y value: 0x8000 >> 4 = 0x800. let mut luma_u16 = [0u16; 8]; - xv36_to_luma_u16_row(&buf, &mut luma_u16, 8, false); + xv36_to_luma_u16_row(&buf, &mut luma_u16, 8, false, false); for &y in &luma_u16 { assert_eq!(y, 0x800); } } + #[test] + fn xv36_be_and_le_dispatchers_agree() { + // BE-encoded data decoded with be_input=true must produce the same + // output as LE-encoded data decoded with be_input=false. 
+ let le_buf = solid_xv36(8, 0x800, 0x800, 0x800); + let be_buf = solid_xv36_be(8, 0x800, 0x800, 0x800); + + // u8 RGB + let mut rgb_le = [0u8; 8 * 3]; + let mut rgb_be = [0u8; 8 * 3]; + xv36_to_rgb_row( + &le_buf, + &mut rgb_le, + 8, + ColorMatrix::Bt709, + true, + false, + false, + ); + xv36_to_rgb_row( + &be_buf, + &mut rgb_be, + 8, + ColorMatrix::Bt709, + true, + false, + true, + ); + assert_eq!( + rgb_le, rgb_be, + "LE and BE must produce identical RGB output" + ); + + // u8 luma + let mut luma_le = [0u8; 8]; + let mut luma_be = [0u8; 8]; + xv36_to_luma_row(&le_buf, &mut luma_le, 8, false, false); + xv36_to_luma_row(&be_buf, &mut luma_be, 8, false, true); + assert_eq!( + luma_le, luma_be, + "LE and BE must produce identical luma output" + ); + } + // ---- 32-bit width × 4 overflow guard ------------------------------------ // // XV36 packed rows consume `4 * width` u16 elements. Without the @@ -507,6 +761,7 @@ mod tests { ColorMatrix::Bt709, true, false, + false, ); } } diff --git a/src/row/scalar/ayuv64.rs b/src/row/scalar/ayuv64.rs index de7a9b7e..b5c4a12d 100644 --- a/src/row/scalar/ayuv64.rs +++ b/src/row/scalar/ayuv64.rs @@ -15,6 +15,10 @@ //! u8 output uses i32 chroma (output-range scaling keeps within i32); //! u16 output uses **i64 chroma** via `q15_chroma64` (Q15 sums //! overflow i32 at BITS=16/16, peak ~3.7e9 for BT.2020). +//! +//! `` — when `true`, each `u16` element of the input +//! slice is byte-swapped before use. This handles the `AYUV64BE` +//! big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -23,6 +27,8 @@ use super::*; /// Channel slot order: A at slot 0, Y at slot 1, U at slot 2, V at slot 3 /// (differs from VUYA which has A at slot 3). No right-shift needed — 16-bit /// native samples with no padding bits. +/// +/// Samples are passed already endian-corrected by the caller. #[cfg_attr(not(tarpaulin), inline(always))] const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { let a = quad[0]; // slot 0 = A (source α) @@ -32,6 +38,15 @@ const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { (u, y, v, a) // returned as (u, y, v, a) for consistency with chroma pipeline } +/// Load one AYUV64 u16 sample, applying a byte-swap for BE wire format +/// when `BE = true`. Uses target-endian aware `u16::from_be`/`u16::from_le` +/// — these are no-ops when the source byte order matches the host, so the +/// helper produces correct samples on both LE and BE hosts (e.g. s390x). +#[cfg_attr(not(tarpaulin), inline(always))] +fn load_ayuv64_u16(v: u16) -> u16 { + if BE { u16::from_be(v) } else { u16::from_le(v) } +} + // ---- u8 output (i32 chroma) -------------------------------------------- /// Shared scalar kernel for AYUV64 → packed **RGB** (`ALPHA = false, @@ -49,7 +64,11 @@ const fn extract_ayuv64(quad: &[u16]) -> (i32, i32, i32, u16) { /// - `packed.len() >= width * 4`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. 
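// Worked sizes for the contract above (illustration only, no new requirements):
// at `width = 2`, `packed` must hold at least 2 * 4 = 8 u16 samples (two
// A, Y, U, V quads) and `out` at least 2 * 3 = 6 bytes for RGB or
// 2 * 4 = 8 bytes for RGBA.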
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgb_or_rgba_row( +pub(crate) fn ayuv64_to_rgb_or_rgba_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u8], width: usize, @@ -70,7 +89,13 @@ pub(crate) fn ayuv64_to_rgb_or_rgba_row(packed[pix_off]), + load_ayuv64_u16::(packed[pix_off + 1]), + load_ayuv64_u16::(packed[pix_off + 2]), + load_ayuv64_u16::(packed[pix_off + 3]), + ]; + let (u, y, v, a) = extract_ayuv64(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -94,27 +119,27 @@ pub(crate) fn ayuv64_to_rgb_or_rgba_row( packed: &[u16], rgb_out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgb_out, width, matrix, full_range); } /// Scalar AYUV64 → packed **RGBA** (4 bpp). The source A u16 at slot 0 /// of each pixel quadruple is depth-converted to u8 via `>> 8`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgba_row( +pub(crate) fn ayuv64_to_rgba_row( packed: &[u16], rgba_out: &mut [u8], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_or_rgba_row::(packed, rgba_out, width, matrix, full_range); } // ---- u16 output (i64 chroma) ------------------------------------------- @@ -132,7 +157,11 @@ pub(crate) fn ayuv64_to_rgba_row( /// - `packed.len() >= width * 4`. /// - `out.len() >= width * (if ALPHA { 4 } else { 3 })`. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row< + const ALPHA: bool, + const ALPHA_SRC: bool, + const BE: bool, +>( packed: &[u16], out: &mut [u16], width: usize, @@ -152,7 +181,13 @@ pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row(packed[pix_off]), + load_ayuv64_u16::(packed[pix_off + 1]), + load_ayuv64_u16::(packed[pix_off + 2]), + load_ayuv64_u16::(packed[pix_off + 3]), + ]; + let (u, y, v, a) = extract_ayuv64(&quad); // q15_scale returns i32; q15_chroma64 handles the i32→i64 promotion // internally — pass i32 values directly (same API as q15_chroma). let u_d = q15_scale(u - bias, c_scale); @@ -180,27 +215,27 @@ pub(crate) fn ayuv64_to_rgb_u16_or_rgba_u16_row( packed: &[u16], rgb_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgb_out, width, matrix, full_range); } /// Scalar AYUV64 → packed **RGBA u16** (4 × u16 per pixel). The source A u16 /// at slot 0 of each pixel quadruple is written direct (no conversion). #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_rgba_u16_row( +pub(crate) fn ayuv64_to_rgba_u16_row( packed: &[u16], rgba_out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { - ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); + ayuv64_to_rgb_u16_or_rgba_u16_row::(packed, rgba_out, width, matrix, full_range); } // ---- Luma extraction --------------------------------------------------- @@ -208,22 +243,30 @@ pub(crate) fn ayuv64_to_rgba_u16_row( /// Copies only the Y u16 from each AYUV64 pixel into a u8 luma plane, /// extracting the high byte via `>> 8`. Y is at slot 1 of each quadruple. 
#[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_luma_row(packed: &[u16], luma_out: &mut [u8], width: usize) { +pub(crate) fn ayuv64_to_luma_row( + packed: &[u16], + luma_out: &mut [u8], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); for x in 0..width { - luma_out[x] = (packed[x * 4 + 1] >> 8) as u8; + luma_out[x] = (load_ayuv64_u16::(packed[x * 4 + 1]) >> 8) as u8; } } /// Copies only the Y u16 from each AYUV64 pixel into a u16 luma plane, /// direct (no shift — 16-bit native). Y is at slot 1 of each quadruple. #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn ayuv64_to_luma_u16_row(packed: &[u16], luma_out: &mut [u16], width: usize) { +pub(crate) fn ayuv64_to_luma_u16_row( + packed: &[u16], + luma_out: &mut [u16], + width: usize, +) { debug_assert!(packed.len() >= width * 4, "packed row too short"); debug_assert!(luma_out.len() >= width, "luma row too short"); for x in 0..width { - luma_out[x] = packed[x * 4 + 1]; + luma_out[x] = load_ayuv64_u16::(packed[x * 4 + 1]); } } @@ -252,7 +295,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u8; 4 * 3]; - ayuv64_to_rgb_row(&packed, &mut out, 4, ColorMatrix::Bt709, false); + ayuv64_to_rgb_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); // Black pixels → [0, 0, 0] assert_eq!(&out[0..3], &[0u8, 0, 0], "black pixel 0"); assert_eq!(&out[3..6], &[0u8, 0, 0], "black pixel 1"); @@ -269,7 +312,7 @@ mod tests { let p1 = pack_ayuv64(0x9999, 60160, 32768, 32768); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u8; 2 * 4]; - ayuv64_to_rgba_row(&packed, &mut out, 2, ColorMatrix::Bt709, false); + ayuv64_to_rgba_row::(&packed, &mut out, 2, ColorMatrix::Bt709, false); assert_eq!(out[3], 0x42, "pixel 0 alpha (0x4242 >> 8 = 0x42)"); assert_eq!(out[7], 0x99, "pixel 1 alpha (0x9999 >> 8 = 0x99)"); } @@ -282,7 +325,7 @@ mod tests { let p1 = pack_ayuv64(0x9999, 60160, 32768, 32768); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u16; 2 * 4]; - ayuv64_to_rgba_u16_row(&packed, &mut out, 2, ColorMatrix::Bt709, false); + ayuv64_to_rgba_u16_row::(&packed, &mut out, 2, ColorMatrix::Bt709, false); assert_eq!(out[3], 0x4242, "pixel 0 alpha u16 direct"); assert_eq!(out[7], 0x9999, "pixel 1 alpha u16 direct"); } @@ -295,7 +338,7 @@ mod tests { let p1 = pack_ayuv64(0, 0x4000, 0, 0); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u8; 2]; - ayuv64_to_luma_row(&packed, &mut out, 2); + ayuv64_to_luma_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFu8, 0x40], "luma u8 high-byte extract"); } @@ -307,7 +350,37 @@ mod tests { let p1 = pack_ayuv64(0, 0x1234, 0, 0); let packed: Vec = [p0, p1].iter().flatten().copied().collect(); let mut out = vec![0u16; 2]; - ayuv64_to_luma_u16_row(&packed, &mut out, 2); + ayuv64_to_luma_u16_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xABCDu16, 0x1234], "luma u16 direct extract"); } + + #[test] + fn ayuv64_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended values as LE-encoded bytes and `be` carries the same values as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native values (and produce identical RGB output) on both + // LE and BE hosts. 
The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. + let intended = pack_ayuv64(0xFFFF, 60160, 32768, 32768); + let le_bytes: Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le_buf: Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be_buf: Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + ayuv64_to_rgb_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + ayuv64_to_rgb_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!( + out_le, out_be, + "AYUV64 BE scalar must match byte-swapped LE" + ); + } } diff --git a/src/row/scalar/v410.rs b/src/row/scalar/v410.rs index 26820811..51935d6c 100644 --- a/src/row/scalar/v410.rs +++ b/src/row/scalar/v410.rs @@ -3,6 +3,10 @@ //! 10-bit U / Y / V channels with 2-bit padding (see //! [`crate::frame::V410Frame`]). 4:4:4 means no chroma deinterleave //! step — each word yields a complete `(Y, U, V)` triple. +//! +//! `` — when `true`, each `u32` element of the input +//! slice is byte-swapped before field extraction. This handles the +//! `V410BE` big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -18,7 +22,7 @@ const fn extract_v410(word: u32) -> (i32, i32, i32) { // ---- u8 RGB / RGBA output ---------------------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_rgb_or_rgba_row( +pub(crate) fn v410_to_rgb_or_rgba_row( packed: &[u32], out: &mut [u8], width: usize, @@ -33,7 +37,12 @@ pub(crate) fn v410_to_rgb_or_rgba_row( let (y_off, y_scale, c_scale) = range_params_n::<10, 8>(full_range); let bias = chroma_bias::<10>(); - for (x, &word) in packed[..width].iter().enumerate() { + for (x, &raw) in packed[..width].iter().enumerate() { + let word = if BE { + u32::from_be(raw) + } else { + u32::from_le(raw) + }; let (u, y, v) = extract_v410(word); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -55,7 +64,7 @@ pub(crate) fn v410_to_rgb_or_rgba_row( // ---- u16 RGB / RGBA native-depth output -------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( packed: &[u32], out: &mut [u16], width: usize, @@ -72,7 +81,12 @@ pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( let alpha_max: u16 = 0x3FF; let out_max: i32 = 0x3FF; - for (x, &word) in packed[..width].iter().enumerate() { + for (x, &raw) in packed[..width].iter().enumerate() { + let word = if BE { + u32::from_be(raw) + } else { + u32::from_le(raw) + }; let (u, y, v) = extract_v410(word); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); @@ -94,11 +108,16 @@ pub(crate) fn v410_to_rgb_u16_or_rgba_u16_row( // ---- Luma (u8) — `>> 2` ------------------------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { +pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); for x in 0..width { - let y = (packed[x] >> 10) & 0x3FF; + let word = if BE { + u32::from_be(packed[x]) + } else { + u32::from_le(packed[x]) + }; + let y = (word >> 10) & 0x3FF; 
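        // Worked example: for pack_v410(200, 500, 800) the word is
        // (800 << 20) | (500 << 10) | 200, so `(word >> 10) & 0x3FF` recovers
        // Y = 500 and the `>> 2` below yields 125u8.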
out[x] = (y >> 2) as u8; } } @@ -106,11 +125,16 @@ pub(crate) fn v410_to_luma_row(packed: &[u32], out: &mut [u8], width: usize) { // ---- Luma (u16, low-bit-packed at 10-bit) ------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { +pub(crate) fn v410_to_luma_u16_row(packed: &[u32], out: &mut [u16], width: usize) { debug_assert!(packed.len() >= width); debug_assert!(out.len() >= width); for x in 0..width { - let y = (packed[x] >> 10) & 0x3FF; + let word = if BE { + u32::from_be(packed[x]) + } else { + u32::from_le(packed[x]) + }; + let y = (word >> 10) & 0x3FF; out[x] = y as u16; } } @@ -138,7 +162,7 @@ mod tests { pack_v410(512, 940, 512), ]; let mut out = vec![0u8; 4 * 3]; - v410_to_rgb_or_rgba_row::(&p, &mut out, 4, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&p, &mut out, 4, ColorMatrix::Bt709, false); // Two black pixels followed by two white pixels. assert_eq!(&out[0..3], &[0u8, 0, 0]); assert_eq!(&out[3..6], &[0u8, 0, 0]); @@ -150,7 +174,7 @@ mod tests { fn v410_known_pattern_rgba_alpha_max() { let p = vec![pack_v410(512, 940, 512)]; let mut out = vec![0u8; 4]; - v410_to_rgb_or_rgba_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); assert_eq!(out[3], 0xFF); } @@ -161,7 +185,7 @@ mod tests { pack_v410(0, 0x100, 0), // Y = 0x100 ]; let mut out = vec![0u8; 2]; - v410_to_luma_row(&p, &mut out, 2); + v410_to_luma_row::(&p, &mut out, 2); // 0x3FF >> 2 = 0xFF; 0x100 >> 2 = 0x40. assert_eq!(&out[..], &[0xFFu8, 0x40]); } @@ -170,7 +194,7 @@ mod tests { fn v410_luma_extract_u16_low_bit_packed() { let p = vec![pack_v410(0, 0x3FF, 0), pack_v410(0, 0x123, 0)]; let mut out = vec![0u16; 2]; - v410_to_luma_u16_row(&p, &mut out, 2); + v410_to_luma_u16_row::(&p, &mut out, 2); assert_eq!(&out[..], &[0x3FFu16, 0x123]); } @@ -178,8 +202,35 @@ mod tests { fn v410_known_pattern_rgba_u16_alpha_max() { let p = vec![pack_v410(512, 940, 512)]; let mut out = vec![0u16; 4]; - v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); + v410_to_rgb_u16_or_rgba_u16_row::(&p, &mut out, 1, ColorMatrix::Bt709, false); // 10-bit alpha max is 0x3FF (low-bit-packed). assert_eq!(out[3], 0x3FF); } + + #[test] + fn v410_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended value as LE-encoded bytes and `be` carries the same value as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native value (and produce identical RGB output) on both + // LE and BE hosts. The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. 
+ let intended = pack_v410(200, 500, 800); + let le_bytes: std::vec::Vec = intended.to_le_bytes().to_vec(); + let be_bytes: std::vec::Vec = intended.to_be_bytes().to_vec(); + let le_buf: std::vec::Vec = le_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let be_buf: std::vec::Vec = be_bytes + .chunks_exact(4) + .map(|b| u32::from_ne_bytes([b[0], b[1], b[2], b[3]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + v410_to_rgb_or_rgba_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + v410_to_rgb_or_rgba_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "V410 BE scalar must match byte-swapped LE"); + } } diff --git a/src/row/scalar/xv36.rs b/src/row/scalar/xv36.rs index 61b82352..0c49e7ef 100644 --- a/src/row/scalar/xv36.rs +++ b/src/row/scalar/xv36.rs @@ -8,6 +8,10 @@ //! are independent. Bit extraction is `>> 4` to drop the 4 padding //! LSBs, then the standard Q15 chroma + Y pipeline at BITS=12 (i32 //! chroma — same depth as Y2xx at BITS=12). +//! +//! `` — when `true`, each `u16` element of the input +//! slice is byte-swapped before use. This handles the `XV36BE` +//! big-endian wire format; `BE = false` is the standard LE path. use super::*; @@ -15,6 +19,8 @@ use super::*; /// is padding and is not returned. Each channel is `>> 4` to drop /// the 4 padding LSBs, bringing the 12-bit MSB-aligned sample to /// the BITS=12 range `[0, 4095]`. +/// +/// Samples are passed already endian-corrected by the caller. #[cfg_attr(not(tarpaulin), inline(always))] const fn extract_xv36(quad: &[u16]) -> (i32, i32, i32) { let u = (quad[0] >> 4) as i32; @@ -24,10 +30,19 @@ const fn extract_xv36(quad: &[u16]) -> (i32, i32, i32) { (u, y, v) } +/// Load one XV36 u16 sample, applying a byte-swap for BE wire format +/// when `BE = true`. Uses target-endian aware `u16::from_be`/`u16::from_le` +/// — these are no-ops when the source byte order matches the host, so the +/// helper produces correct samples on both LE and BE hosts (e.g. s390x). 
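// The same property spelled out as a quick sketch (mirrors the helper below):
// `u16::from_be(v)` swaps bytes only on a little-endian host and
// `u16::from_le(v)` only on a big-endian host, so whichever wire format matches
// the host compiles down to a plain move.
//
//     let sample: u16 = 0x1234;
//     assert_eq!(u16::from_be(sample.to_be()), sample); // BE wire -> host, any host
//     assert_eq!(u16::from_le(sample.to_le()), sample); // LE wire -> host, any host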
+#[cfg_attr(not(tarpaulin), inline(always))] +fn load_xv36_u16(v: u16) -> u16 { + if BE { u16::from_be(v) } else { u16::from_le(v) } +} + // ---- u8 RGB / RGBA output ---------------------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_rgb_or_rgba_row( +pub(crate) fn xv36_to_rgb_or_rgba_row( packed: &[u16], out: &mut [u8], width: usize, @@ -43,8 +58,14 @@ pub(crate) fn xv36_to_rgb_or_rgba_row( let bias = chroma_bias::<12>(); for x in 0..width { - let quad = &packed[x * 4..x * 4 + 4]; - let (u, y, v) = extract_xv36(quad); + let base = x * 4; + let quad = [ + load_xv36_u16::(packed[base]), + load_xv36_u16::(packed[base + 1]), + load_xv36_u16::(packed[base + 2]), + load_xv36_u16::(packed[base + 3]), + ]; + let (u, y, v) = extract_xv36(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -65,7 +86,7 @@ pub(crate) fn xv36_to_rgb_or_rgba_row( // ---- u16 RGB / RGBA native-depth output -------------------------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( +pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( packed: &[u16], out: &mut [u16], width: usize, @@ -83,8 +104,14 @@ pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( let out_max: i32 = 0x0FFF; for x in 0..width { - let quad = &packed[x * 4..x * 4 + 4]; - let (u, y, v) = extract_xv36(quad); + let base = x * 4; + let quad = [ + load_xv36_u16::(packed[base]), + load_xv36_u16::(packed[base + 1]), + load_xv36_u16::(packed[base + 2]), + load_xv36_u16::(packed[base + 3]), + ]; + let (u, y, v) = extract_xv36(&quad); let u_d = q15_scale(u - bias, c_scale); let v_d = q15_scale(v - bias, c_scale); let r_chroma = q15_chroma(coeffs.r_u(), u_d, coeffs.r_v(), v_d); @@ -105,11 +132,11 @@ pub(crate) fn xv36_to_rgb_u16_or_rgba_u16_row( // ---- Luma (u8) — `>> 8` (drops 4 padding bits + 4 LSBs) ---------------- #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { +pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); for x in 0..width { - let y = packed[x * 4 + 1] >> 8; + let y = load_xv36_u16::(packed[x * 4 + 1]) >> 8; out[x] = y as u8; } } @@ -117,11 +144,11 @@ pub(crate) fn xv36_to_luma_row(packed: &[u16], out: &mut [u8], width: usize) { // ---- Luma (u16, low-bit-packed at 12-bit) ------------------------------ #[cfg_attr(not(tarpaulin), inline(always))] -pub(crate) fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { +pub(crate) fn xv36_to_luma_u16_row(packed: &[u16], out: &mut [u16], width: usize) { debug_assert!(packed.len() >= width * 4); debug_assert!(out.len() >= width); for x in 0..width { - let y = packed[x * 4 + 1] >> 4; + let y = load_xv36_u16::(packed[x * 4 + 1]) >> 4; out[x] = y; } } @@ -149,7 +176,7 @@ mod tests { let p3 = pack_xv36(2048, 3760, 2048, 0); let packed: Vec = [p0, p1, p2, p3].iter().flatten().copied().collect(); let mut out = vec![0u8; 4 * 3]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 4, ColorMatrix::Bt709, false); assert_eq!(&out[0..3], &[0u8, 0, 0]); assert_eq!(&out[3..6], &[0u8, 0, 0]); assert_eq!(&out[6..9], &[255u8, 255, 255]); @@ -161,7 +188,7 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u8; 
4]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); // X = padding; RGBA forces α=0xFF regardless of source A byte. assert_eq!(out[3], 0xFF); } @@ -172,7 +199,7 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0xFFF); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u8; 4]; - xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); assert_eq!(out[3], 0xFF); } @@ -185,7 +212,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u8; 2]; - xv36_to_luma_row(&packed, &mut out, 2); + xv36_to_luma_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFu8, 0x10]); } @@ -197,7 +224,7 @@ mod tests { .copied() .collect(); let mut out = vec![0u16; 2]; - xv36_to_luma_u16_row(&packed, &mut out, 2); + xv36_to_luma_u16_row::(&packed, &mut out, 2); assert_eq!(&out[..], &[0xFFFu16, 0x123]); } @@ -206,8 +233,35 @@ mod tests { let p = pack_xv36(2048, 3760, 2048, 0xFFF); let packed: Vec = p.into_iter().collect(); let mut out = vec![0u16; 4]; - xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_u16_or_rgba_u16_row::(&packed, &mut out, 1, ColorMatrix::Bt709, false); // 12-bit alpha max = 0x0FFF; X = padding so source A byte is ignored. assert_eq!(out[3], 0x0FFF); } + + #[test] + fn xv36_be_roundtrip_matches_byte_swapped_le() { + // Construct LE/BE buffers from raw bytes via `to_le_bytes` / `to_be_bytes` + // so semantics are host-independent: on every host, `le` carries the + // intended values as LE-encoded bytes and `be` carries the same values as + // BE-encoded bytes. Both kernels should therefore decode to the same + // intended host-native values (and produce identical RGB output) on both + // LE and BE hosts. The earlier `swap_bytes` pattern only validated this + // on LE hosts and degenerated to equal-but-wrong on BE hosts. 
+ let intended = pack_xv36(1024, 2048, 512, 0); + let le_bytes: Vec = intended.iter().flat_map(|v| v.to_le_bytes()).collect(); + let be_bytes: Vec = intended.iter().flat_map(|v| v.to_be_bytes()).collect(); + let le_buf: Vec = le_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let be_buf: Vec = be_bytes + .chunks_exact(2) + .map(|b| u16::from_ne_bytes([b[0], b[1]])) + .collect(); + let mut out_le = vec![0u8; 3]; + let mut out_be = vec![0u8; 3]; + xv36_to_rgb_or_rgba_row::(&le_buf, &mut out_le, 1, ColorMatrix::Bt709, false); + xv36_to_rgb_or_rgba_row::(&be_buf, &mut out_be, 1, ColorMatrix::Bt709, false); + assert_eq!(out_le, out_be, "XV36 BE scalar must match byte-swapped LE"); + } } diff --git a/src/sinker/mixed/ayuv64.rs b/src/sinker/mixed/ayuv64.rs index 6ab55456..afba3d5e 100644 --- a/src/sinker/mixed/ayuv64.rs +++ b/src/sinker/mixed/ayuv64.rs @@ -252,6 +252,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -263,6 +264,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -290,6 +292,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -308,6 +311,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -328,7 +332,15 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { w, h, )?; - ayuv64_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + ayuv64_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( @@ -370,6 +382,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); } @@ -393,6 +406,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); // Strategy A+ combo (u16): RGBA u16 also attached — derive from the @@ -429,6 +443,7 @@ impl PixelSink for MixedSinker<'_, Ayuv64> { row.matrix(), row.full_range(), use_simd, + false, ); } diff --git a/src/sinker/mixed/tests/ayuv64.rs b/src/sinker/mixed/tests/ayuv64.rs index 12bcf366..3f0e4709 100644 --- a/src/sinker/mixed/tests/ayuv64.rs +++ b/src/sinker/mixed/tests/ayuv64.rs @@ -856,14 +856,14 @@ fn ayuv64_strategy_a_plus_matches_independent_kernel() { let row_off_packed = r * width * 4; let row_off_rgb = r * width * 3; let row_off_rgba = r * width * 4; - crate::row::scalar::ayuv64_to_rgb_row( + crate::row::scalar::ayuv64_to_rgb_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgb[row_off_rgb..row_off_rgb + width * 3], width, matrix, full_range, ); - crate::row::scalar::ayuv64_to_rgba_row( + crate::row::scalar::ayuv64_to_rgba_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgba[row_off_rgba..row_off_rgba + width * 4], width, @@ -931,14 +931,14 @@ fn ayuv64_strategy_a_plus_u16_matches_independent_kernel() { let row_off_packed = r * width * 4; let row_off_rgb = r * width * 3; let row_off_rgba = r * width * 4; - crate::row::scalar::ayuv64_to_rgb_u16_row( + crate::row::scalar::ayuv64_to_rgb_u16_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut inline_rgb[row_off_rgb..row_off_rgb + width * 3], width, matrix, full_range, ); - crate::row::scalar::ayuv64_to_rgba_u16_row( + crate::row::scalar::ayuv64_to_rgba_u16_row::( &packed[row_off_packed..row_off_packed + width * 4], &mut 
inline_rgba[row_off_rgba..row_off_rgba + width * 4], width, diff --git a/src/sinker/mixed/tests/packed_rgb_f16.rs b/src/sinker/mixed/tests/packed_rgb_f16.rs index 574e482c..de173084 100644 --- a/src/sinker/mixed/tests/packed_rgb_f16.rs +++ b/src/sinker/mixed/tests/packed_rgb_f16.rs @@ -330,7 +330,17 @@ fn rgbf16_simd_matches_scalar_with_random_input() { /// driven by miri on s390x / powerpc64; gating it out of miri (per the /// codex 4th-pass finding) would skip exactly the host where BE corruption /// would surface. +/// +/// Re-gated on miri because the fixture builder calls `half::f16::from_f32`, +/// which on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` that miri rejects. BE-host miri (s390x / +/// powerpc64) covers the byte-swap correctness via the f32 LE-encoded +/// regression tests in this module instead. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn rgbf16_sinker_le_encoded_frame_decodes_correctly() { let vals_f32 = [0.5f32, 1.5, -0.25, 100.0]; let intended: Vec = (0..16 * 4 * 3) diff --git a/src/sinker/mixed/tests/planar_gbr_float.rs b/src/sinker/mixed/tests/planar_gbr_float.rs index a8132480..7d1d9eb2 100644 --- a/src/sinker/mixed/tests/planar_gbr_float.rs +++ b/src/sinker/mixed/tests/planar_gbr_float.rs @@ -1024,10 +1024,17 @@ fn gbrapf32_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrpf16`]. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1104,10 +1111,17 @@ fn gbrpf16_sinker_le_encoded_frame_decodes_correctly() { /// LE-encoded byte contract regression for [`Gbrapf16`] (lossless RGBA /// pass-through, including the α plane). /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1178,10 +1192,17 @@ fn gbrapf16_sinker_le_encoded_frame_decodes_correctly() { /// `widen_f16_be_to_host_f32::` would interpret byte-swapped bits as /// host-native f16 and decode to wildly wrong f32 values. 
/// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1247,10 +1268,17 @@ fn gbrpf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// (`with_rgba_f32`, including the α plane). Exercises the four-plane f16 → /// f32 widen step — same bit-normalise-first contract as the no-α variant. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1310,10 +1338,17 @@ fn gbrapf16_sinker_widen_path_le_encoded_frame_decodes_correctly() { /// the kernel byte-swap a no-op on every host. Vacuous on LE; would catch /// the double-swap on BE. /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. +/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrpf16_sinker_widen_path_u16_and_u8_le_encoded_frame_decodes_correctly() { let w = 16usize; let h = 4usize; @@ -1595,10 +1630,17 @@ fn gbrapf32_strategy_a_plus_le_encoded_u16_alpha_decodes_correctly() { /// `widen_and_scatter` helper, so this test guards against the /// post-widen routing flag being wrong). /// -/// Forces `with_simd(false)` so the test is miri-safe and runs on BE-host -/// miri CI. See the `gbrpf32_sinker_le_encoded_frame_decodes_correctly` -/// docstring for the rationale. 
+/// Forces `with_simd(false)` so the kernel runs purely scalar — no SIMD +/// intrinsics — but the fixture builder calls `half::f16::from_f32`, which +/// on aarch64 / x86 / x86_64 with `target_feature = "fp16"` (or F16C) +/// expands to inline `asm!` unsupported by miri. Miri-gated on every +/// target — BE-host miri (s390x / powerpc64) covers the byte-swap +/// correctness via the f32 LE-encoded regressions in this module. #[test] +#[cfg_attr( + miri, + ignore = "half::f16::from_f32 uses inline asm (fcvt) unsupported by Miri" +)] fn gbrapf16_strategy_a_plus_post_widen_alpha_decodes_correctly() { let w = 15usize; let h = 3usize; diff --git a/src/sinker/mixed/v410.rs b/src/sinker/mixed/v410.rs index c1c865db..09adc483 100644 --- a/src/sinker/mixed/v410.rs +++ b/src/sinker/mixed/v410.rs @@ -201,6 +201,7 @@ impl PixelSink for MixedSinker<'_, V410> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 10-bit Y values at native depth (low-bit-packed @@ -211,6 +212,7 @@ impl PixelSink for MixedSinker<'_, V410> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -231,6 +233,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -251,6 +254,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -281,6 +285,7 @@ impl PixelSink for MixedSinker<'_, V410> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -297,7 +302,15 @@ impl PixelSink for MixedSinker<'_, V410> { w, h, )?; - v410_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + v410_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row( diff --git a/src/sinker/mixed/xv36.rs b/src/sinker/mixed/xv36.rs index e773fab9..b7579d4e 100644 --- a/src/sinker/mixed/xv36.rs +++ b/src/sinker/mixed/xv36.rs @@ -209,6 +209,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } // Luma u16 — extract 12-bit Y values at native depth (shift >> 4 @@ -219,6 +220,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { &mut buf[one_plane_start..one_plane_end], w, use_simd, + false, ); } @@ -239,6 +241,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); } else if want_rgb_u16 { let rgb_u16_buf = rgb_u16.as_deref_mut().unwrap(); @@ -259,6 +262,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); if want_rgba_u16 { // Strategy A u16 fan-out — derive RGBA from the just-computed @@ -289,6 +293,7 @@ impl PixelSink for MixedSinker<'_, Xv36> { row.matrix(), row.full_range(), use_simd, + false, ); return Ok(()); } @@ -305,7 +310,15 @@ impl PixelSink for MixedSinker<'_, Xv36> { w, h, )?; - xv36_to_rgb_row(packed, rgb_row, w, row.matrix(), row.full_range(), use_simd); + xv36_to_rgb_row( + packed, + rgb_row, + w, + row.matrix(), + row.full_range(), + use_simd, + false, + ); if let Some(hsv) = hsv.as_mut() { rgb_to_hsv_row(