From 1fe923446e6c30267f2632267bf903b866ff8dd5 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 11:38:32 +1200 Subject: [PATCH 1/6] Ship 8b-2c: Yuva420p family u16 RGBA SIMD across all 5 backends MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds native-depth u16 RGBA SIMD across NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128 for the high-bit YUVA 4:2:0 family — Yuva420p9 / Yuva420p10 (BITS-generic) and Yuva420p16 (16-bit). Wires the 3 u16 RGBA dispatchers in src/row/mod.rs that landed as scalar-only stubs in PR #35 (Ship 8b-2a), completing the Yuva420p source-side family across u8 RGBA (8b-2b, PR #36) and u16 RGBA (this PR). Note: 8-bit Yuva420p has no u16 RGBA path — its u8 alpha source doesn't widen meaningfully into a u16 alpha output, and the public API doesn't expose it. ## Changes - **5 SIMD backends** — each gain a third const-generic `ALPHA_SRC: bool` added to the existing `<BITS, ALPHA>` (or `<ALPHA>` for 16-bit) u16 RGBA templates across 2 kernel families: - high-bit BITS-generic: `yuv_420p_n_to_rgb_or_rgba_u16_row` - 16-bit: `yuv_420p16_to_rgb_or_rgba_u16_row` When `ALPHA_SRC = true`: - **High-bit (Yuva420p9/10)**: alpha is loaded + AND-masked with `bits_mask::<BITS>()` (same hardening as Y/U/V) and stored at native bit depth — no shift since both source and output are at BITS. - **16-bit (Yuva420p16)**: alpha is loaded directly as full-range u16 — no mask, no shift. Existing no-alpha / opaque-alpha wrappers stay backward-compat by passing `ALPHA_SRC = false, None`. AVX-512 16-bit's `write_rgba_u16_32` helper broadcasts a single 128-bit alpha lane, so the ALPHA_SRC = true branch inlines four `write_rgba_u16_8` calls with per-quarter alpha extraction instead. 
- **3 u16 RGBA dispatchers wired** in `src/row/mod.rs` (`yuva420p9_to_rgba_u16_row`, `yuva420p10_to_rgba_u16_row`, `yuva420p16_to_rgba_u16_row`) — replace the prior `let _ = use_simd` stubs with the standard `cfg_select!` per-arch route block, mirroring the Yuva444p10 u16 dispatchers' patterns from PR #34. - **Per-backend u16 RGBA equivalence tests** — 25 new `#[test]` functions across the 5 backend test modules (5 NEON, 5 each on SSE4.1 / AVX2 / AVX-512 / wasm simd128). Each new x86 test early-returns on `is_x86_feature_detected!` to satisfy CI sanitizer / Miri / non-feature-flagged runners. Pseudo-random alpha flushes lane-order corruption that solid alpha would mask. - Compile-time `const { assert!(!ALPHA_SRC || ALPHA) }` retained on every shared template — source alpha requires RGBA output. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/arch/neon.rs | 190 +++++++++++++++++++--- src/row/arch/neon/tests.rs | 165 +++++++++++++++++++ src/row/arch/wasm_simd128.rs | 180 ++++++++++++++++++--- src/row/arch/wasm_simd128/tests.rs | 143 +++++++++++++++++ src/row/arch/x86_avx2.rs | 196 ++++++++++++++++++++--- src/row/arch/x86_avx2/tests.rs | 145 +++++++++++++++++ src/row/arch/x86_avx512.rs | 244 +++++++++++++++++++++++++---- src/row/arch/x86_avx512/tests.rs | 145 +++++++++++++++++ src/row/arch/x86_sse41.rs | 191 ++++++++++++++++++---- src/row/arch/x86_sse41/tests.rs | 145 +++++++++++++++++ src/row/mod.rs | 192 +++++++++++++++++++++-- 11 files changed, 1791 insertions(+), 145 deletions(-) diff --git a/src/row/arch/neon.rs b/src/row/arch/neon.rs index e1f8fdf8..f1726268 100644 --- a/src/row/arch/neon.rs +++ b/src/row/arch/neon.rs @@ -676,8 +676,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -702,16 +702,61 @@ pub(crate) 
unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, ); } } -/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha -/// `(1 << BITS) - 1`. +/// NEON YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// (already at the source's native bit depth — no depth conversion) +/// instead of being the opaque maximum `(1 << BITS) - 1`. Same +/// numerical contract as [`yuv_420p_n_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, + ); + } +} + +/// Shared NEON high-bit YUV 4:2:0 → native-depth `u16` kernel for +/// [`yuv_420p_n_to_rgb_u16_row`] (`ALPHA = false, ALPHA_SRC = false`, +/// `vst3q_u16`), [`yuv_420p_n_to_rgba_u16_row`] (`ALPHA = true, +/// ALPHA_SRC = false`, `vst4q_u16` with constant alpha +/// `(1 << BITS) - 1`) and [`yuv_420p_n_to_rgba_u16_with_alpha_src_row`] +/// (`ALPHA = true, ALPHA_SRC = true`, `vst4q_u16` with the alpha lane +/// loaded from `a_src` and masked to native bit depth — no shift since +/// both the source alpha and the u16 output element are at the same +/// native bit depth). /// /// # Safety /// @@ -719,25 +764,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 2. `width & 1 == 0`. /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -819,8 +877,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -2907,8 +2984,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -2931,15 +3008,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// NEON 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. Same +/// numerical contract as [`yuv_420p16_to_rgba_u16_row`] for R/G/B. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "neon")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared NEON 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `vst3q_u16`; `ALPHA = true` -/// writes RGBA quads via `vst4q_u16` with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `vst3q_u16`. +/// - `ALPHA = true, ALPHA_SRC = false`: `vst4q_u16` with constant +/// alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `vst4q_u16` with the alpha +/// lane loaded directly from `a_src` (full-range u16, no mask). /// /// # Safety /// @@ -2948,23 +3067,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "neon")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3074,13 +3202,23 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { + let (a_lo_v, a_hi_v) = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 16 lanes directly, + // no mask or shift needed. + let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); + (vld1q_u16(a_ptr.add(x)), vld1q_u16(a_ptr.add(x + 8))) + } else { + (alpha_u16, alpha_u16) + }; vst4q_u16( out.as_mut_ptr().add(x * 4), - uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, alpha_u16), + uint16x8x4_t(r_lo_u16, g_lo_u16, b_lo_u16, a_lo_v), ); vst4q_u16( out.as_mut_ptr().add(x * 4 + 32), - uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, alpha_u16), + uint16x8x4_t(r_hi_u16, g_hi_u16, b_hi_u16, a_hi_v), ); } else { vst3q_u16( @@ -3101,7 +3239,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/neon/tests.rs b/src/row/arch/neon/tests.rs index 4a5f27c9..6d9eff83 100644 --- a/src/row/arch/neon/tests.rs +++ b/src/row/arch/neon/tests.rs @@ -3234,3 +3234,168 @@ fn neon_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- +// +// Mirrors the 4:4:4 u16 alpha-source pattern for the 4:2:0 family — +// high-bit BITS-generic (Yuva420p9 / Yuva420p10) and 16-bit +// (Yuva420p16). 8-bit Yuva420p has no u16 RGBA path. Pseudo-random +// alpha + per-arch direct kernel call so `vst4q_u16` lane order is +// exercised regardless of the dispatcher tier on the runner. + +fn check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: 
bool, + alpha_seed: usize, +) { + let y = p16_plane_neon(width, 37); + let u = p16_plane_neon(width / 2, 53); + let v = p16_plane_neon(width / 2, 71); + let a_src = p16_plane_neon(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_neon = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_neon, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_neon, + "NEON Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + // BITS = 9, 10 — full matrix sweep × natural width. + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_widths() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p_n_rgba_u16_matches_scalar_random_alpha() { + // Different alpha seeds — confirms alpha lane order through + // `vst4q_u16` doesn't collide with R/G/B. 
+ for seed in [13usize, 41, 127, 211] { + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<10>( + 16, + ColorMatrix::Bt601, + false, + seed, + ); + check_yuv420p_n_u16_neon_rgba_with_alpha_src_equivalence::<9>( + 34, + ColorMatrix::Bt2020Ncl, + true, + seed, + ); + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +#[cfg_attr(miri, ignore = "NEON SIMD intrinsics unsupported by Miri")] +fn neon_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_neon_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/wasm_simd128.rs b/src/row/arch/wasm_simd128.rs index 0d5fc17c..9992274e 100644 --- a/src/row/arch/wasm_simd128.rs +++ b/src/row/arch/wasm_simd128.rs @@ -613,8 +613,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -638,16 +638,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** 
+/// packed RGBA with the per-pixel alpha element **sourced from +/// `a_src`** (masked to BITS, no shift) instead of being the opaque +/// maximum `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared wasm simd128 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: 2× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 2× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 2× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -656,25 +697,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -745,9 +799,21 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3465,8 +3537,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3489,16 +3561,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// wasm simd128 16-bit YUVA 4:2:0 → **native-depth `u16`** packed +/// RGBA with the 
per-pixel alpha element **sourced from `a_src`** +/// (full-range u16, no mask, no shift) instead of being constant +/// `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "simd128")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared wasm simd128 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3507,23 +3620,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. 
#[inline] #[target_feature(enable = "simd128")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3615,7 +3737,15 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { - write_rgba_u16_8(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + let a_v = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 8 lanes directly. + v128_load(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()) + } else { + alpha_u16 + }; + write_rgba_u16_8(r_u16, g_u16, b_u16, a_v, out.as_mut_ptr().add(x * 4)); } else { write_rgb_u16_8(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); } @@ -3628,7 +3758,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/wasm_simd128/tests.rs b/src/row/arch/wasm_simd128/tests.rs index 1a6e36a3..1bfbf67b 100644 --- a/src/row/arch/wasm_simd128/tests.rs +++ b/src/row/arch/wasm_simd128/tests.rs @@ -2777,3 +2777,146 @@ fn simd128_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_wasm = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_wasm, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_wasm(width, 37); + let u = p16_plane_wasm(width / 2, 53); + let v = p16_plane_wasm(width / 2, 71); + let a_src = p16_plane_wasm(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_wasm = std::vec![0u16; 
width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_wasm, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_wasm, + "wasm simd128 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn simd128_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva420p_n_rgba_u16_matches_scalar_widths() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + ); + check_yuv420p_n_u16_simd128_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); + } +} + +#[test] +fn simd128_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn simd128_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_simd128_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff 
--git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 9d893fbf..425609b3 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -662,8 +662,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -687,16 +687,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX2 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed RGBA +/// with the per-pixel alpha element **sourced from `a_src`** (masked +/// to BITS, no shift) instead of being the opaque maximum +/// `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX2 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 4× `write_rgb_u16_8` per -/// 32-pixel block; `ALPHA = true` writes RGBA quads via 4× -/// `write_rgba_u16_8` with constant alpha `(1 << BITS) - 1`. 
+/// - `ALPHA = false, ALPHA_SRC = false`: 4× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 4× `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 4× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -705,25 +746,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -812,33 +866,51 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(a_lo), + _mm256_castsi256_si128(a_hi), + _mm256_extracti128_si256::<1>(a_hi), + ) + } else { + (alpha_u16, alpha_u16, alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_lo), _mm256_castsi256_si128(g_lo), _mm256_castsi256_si128(b_lo), - alpha_u16, + a0_v, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_lo), _mm256_extracti128_si256::<1>(g_lo), _mm256_extracti128_si256::<1>(b_lo), - alpha_u16, + a1_v, dst.add(32), ); write_rgba_u16_8( _mm256_castsi256_si128(r_hi), _mm256_castsi256_si128(g_hi), _mm256_castsi256_si128(b_hi), - alpha_u16, + a2_v, dst.add(64), ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_hi), _mm256_extracti128_si256::<1>(g_hi), _mm256_extracti128_si256::<1>(b_hi), - alpha_u16, + a3_v, dst.add(96), ); } else { @@ -878,7 +950,13 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3743,8 +3821,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3767,15 +3845,56 @@ pub(crate) unsafe 
fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX2 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx2")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX2 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA -/// quads with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: 2× `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: 2× `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: 2× `write_rgba_u16_8` with the +/// alpha lanes loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3784,23 +3903,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. 
#[inline] #[target_feature(enable = "avx2")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3896,19 +4024,33 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( // Write 16 pixels via two 8-pixel helper calls. if ALPHA { + let (a_lo_v, a_hi_v) = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 16 lanes (one + // __m256i = 32 bytes), split into two 128-bit halves. 
+ let a_vec = + _mm256_loadu_si256(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()); + ( + _mm256_castsi256_si128(a_vec), + _mm256_extracti128_si256::<1>(a_vec), + ) + } else { + (alpha_u16, alpha_u16) + }; let dst = out.as_mut_ptr().add(x * 4); write_rgba_u16_8( _mm256_castsi256_si128(r_u16), _mm256_castsi256_si128(g_u16), _mm256_castsi256_si128(b_u16), - alpha_u16, + a_lo_v, dst, ); write_rgba_u16_8( _mm256_extracti128_si256::<1>(r_u16), _mm256_extracti128_si256::<1>(g_u16), _mm256_extracti128_si256::<1>(b_u16), - alpha_u16, + a_hi_v, dst.add(32), ); } else { @@ -3936,7 +4078,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). + let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx2/tests.rs b/src/row/arch/x86_avx2/tests.rs index d574487a..01ce31ff 100644 --- a/src/row/arch/x86_avx2/tests.rs +++ b/src/row/arch/x86_avx2/tests.rs @@ -3043,3 +3043,148 @@ fn avx2_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_avx2(width, 37); + let u = p16_plane_avx2(width / 2, 53); + let v = p16_plane_avx2(width / 2, 71); + let a_src = p16_plane_avx2(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX2 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx2_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(32, m, full, 89); + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(32, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [32usize, 34, 46, 
62, 1920, 1922] { + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_avx2_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx2_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn avx2_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("avx2") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_avx2_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/x86_avx512.rs b/src/row/arch/x86_avx512.rs index 71de14f1..f9afa112 100644 --- a/src/row/arch/x86_avx512.rs +++ b/src/row/arch/x86_avx512.rs @@ -676,8 +676,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -701,16 +701,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// 
(masked to BITS, no shift) instead of being the opaque maximum +/// `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX-512 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via 8× `write_quarter` per -/// 64-pixel block; `ALPHA = true` writes RGBA quads via 8× -/// `write_quarter_rgba` with constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: 8× `write_quarter` per 64-pixel block. +/// - `ALPHA = true, ALPHA_SRC = false`: 8× `write_quarter_rgba` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: 8× `write_quarter_rgba` with +/// the alpha quarters extracted from `a_src` (masked to BITS). /// /// # Safety /// @@ -719,25 +760,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. 
#[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. + const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -829,15 +883,41 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row(a_lo), + _mm512_extracti32x4_epi32::<1>(a_lo), + _mm512_extracti32x4_epi32::<2>(a_lo), + _mm512_extracti32x4_epi32::<3>(a_lo), + _mm512_extracti32x4_epi32::<0>(a_hi), + _mm512_extracti32x4_epi32::<1>(a_hi), + _mm512_extracti32x4_epi32::<2>(a_hi), + _mm512_extracti32x4_epi32::<3>(a_hi), + ) + } else { + ( + alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, alpha_u16, + ) + }; let dst = out.as_mut_ptr().add(x * 4); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 0, dst); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 1, dst.add(32)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 2, dst.add(64)); - write_quarter_rgba(r_lo, g_lo, b_lo, alpha_u16, 3, dst.add(96)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 0, dst.add(128)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 1, dst.add(160)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 2, 
dst.add(192)); - write_quarter_rgba(r_hi, g_hi, b_hi, alpha_u16, 3, dst.add(224)); + write_quarter_rgba(r_lo, g_lo, b_lo, a0, 0, dst); + write_quarter_rgba(r_lo, g_lo, b_lo, a1, 1, dst.add(32)); + write_quarter_rgba(r_lo, g_lo, b_lo, a2, 2, dst.add(64)); + write_quarter_rgba(r_lo, g_lo, b_lo, a3, 3, dst.add(96)); + write_quarter_rgba(r_hi, g_hi, b_hi, a4, 0, dst.add(128)); + write_quarter_rgba(r_hi, g_hi, b_hi, a5, 1, dst.add(160)); + write_quarter_rgba(r_hi, g_hi, b_hi, a6, 2, dst.add(192)); + write_quarter_rgba(r_hi, g_hi, b_hi, a7, 3, dst.add(224)); } else { let dst = out.as_mut_ptr().add(x * 3); write_quarter(r_lo, g_lo, b_lo, 0, dst); @@ -859,7 +939,13 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3863,8 +3949,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3887,16 +3973,57 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// AVX-512 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA +/// with the per-pixel alpha element **sourced from `a_src`** +/// (full-range u16, no mask, no shift) instead of being constant +/// `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "avx512f,avx512bw")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared AVX-512 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_32`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_32` with -/// constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_32`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_32` with +/// constant alpha `0xFFFF` (broadcast 128-bit lane). +/// - `ALPHA = true, ALPHA_SRC = true`: 4× `write_rgba_u16_8` with the +/// alpha quarters loaded from `a_src` (full-range u16, no shift). /// /// # Safety /// @@ -3905,23 +4032,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "avx512f,avx512bw")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -4044,7 +4180,53 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( // Write 32 pixels via the appropriate 4× 8-pixel helper. if ALPHA { - write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 32 lanes (one + // __m512i = 64 bytes), split into four 128-bit quarters + // and inline the 4× write_rgba_u16_8 calls (the standard + // `write_rgba_u16_32` helper broadcasts a single alpha + // 128-bit lane to all 4 quarters, which doesn't fit the + // per-pixel-source-alpha case). 
+ let a_ptr = a_src.as_ref().unwrap_unchecked().as_ptr(); + let a_vec = _mm512_loadu_si512(a_ptr.add(x).cast()); + let a0 = _mm512_extracti32x4_epi32::<0>(a_vec); + let a1 = _mm512_extracti32x4_epi32::<1>(a_vec); + let a2 = _mm512_extracti32x4_epi32::<2>(a_vec); + let a3 = _mm512_extracti32x4_epi32::<3>(a_vec); + let dst = out.as_mut_ptr().add(x * 4); + write_rgba_u16_8( + _mm512_castsi512_si128(r_u16), + _mm512_castsi512_si128(g_u16), + _mm512_castsi512_si128(b_u16), + a0, + dst, + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<1>(r_u16), + _mm512_extracti32x4_epi32::<1>(g_u16), + _mm512_extracti32x4_epi32::<1>(b_u16), + a1, + dst.add(32), + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<2>(r_u16), + _mm512_extracti32x4_epi32::<2>(g_u16), + _mm512_extracti32x4_epi32::<2>(b_u16), + a2, + dst.add(64), + ); + write_rgba_u16_8( + _mm512_extracti32x4_epi32::<3>(r_u16), + _mm512_extracti32x4_epi32::<3>(g_u16), + _mm512_extracti32x4_epi32::<3>(b_u16), + a3, + dst.add(96), + ); + } else { + write_rgba_u16_32(r_u16, g_u16, b_u16, alpha_u16, out.as_mut_ptr().add(x * 4)); + } } else { write_rgb_u16_32(r_u16, g_u16, b_u16, out.as_mut_ptr().add(x * 3)); } @@ -4058,7 +4240,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index e1a25967..b3d6af0e 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -3116,3 +3116,148 @@ fn avx512_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane_avx512(width, 37); + let u = p16_plane_avx512(width / 2, 53); + let v = p16_plane_avx512(width / 2, 71); + let a_src = p16_plane_avx512(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 
4]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "AVX-512 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn avx512_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(64, m, full, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(64, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [64usize, 66, 78, 94, 1920, 1922] { + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn avx512_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(32, m, full, 89); + } + } +} + +#[test] +fn avx512_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("avx512bw") { + return; + } + for w in [32usize, 34, 46, 62, 78, 94, 1920, 1922] { + 
check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_avx512_rgba_with_alpha_src_equivalence(32, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index 37da1bb7..a8935652 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -1044,8 +1044,8 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -1069,16 +1069,57 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p_n_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 YUVA 4:2:0 high-bit-depth → **native-depth `u16`** packed +/// RGBA with the per-pixel alpha element **sourced from `a_src`** +/// (already at the source's native bit depth — masked to BITS, no +/// shift) instead of being the opaque maximum `(1 << BITS) - 1`. +/// +/// Thin wrapper over [`yuv_420p_n_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p_n_to_rgba_u16_row`] plus `a_src.len() >= width`. +#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. 
+ unsafe { + yuv_420p_n_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared SSE4.1 high-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples via `write_rgb_u16_8`; -/// `ALPHA = true` writes RGBA quads via `write_rgba_u16_8` with -/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `(1 << BITS) - 1`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` and masked to BITS. /// /// # Safety /// @@ -1087,25 +1128,38 @@ pub(crate) unsafe fn yuv_420p_n_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. -/// 4. `BITS` ∈ `{9, 10, 12, 14}`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. +/// 5. `BITS` ∈ `{9, 10, 12, 14}`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row< + const BITS: u32, + const ALPHA: bool, + const ALPHA_SRC: bool, +>( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { const { assert!(BITS == 9 || BITS == 10 || BITS == 12 || BITS == 14) }; + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::(full_range); @@ -1181,14 +1235,20 @@ pub(crate) unsafe fn yuv_420p_n_to_rgb_or_rgba_u16_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p_n_to_rgba_u16_row::( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); @@ -3255,8 +3321,8 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgb_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgb_out, width, matrix, full_range, ); } } @@ -3279,15 +3345,56 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( full_range: bool, ) { unsafe { - yuv_420p16_to_rgb_or_rgba_u16_row::( - y, u_half, v_half, rgba_out, width, matrix, full_range, + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, u_half, v_half, None, rgba_out, width, matrix, full_range, + ); + } +} + +/// SSE4.1 16-bit YUVA 4:2:0 → **native-depth `u16`** packed RGBA with +/// the per-pixel alpha element **sourced from `a_src`** (full-range +/// u16, no mask, no shift) instead of being constant `0xFFFF`. +/// +/// Thin wrapper over [`yuv_420p16_to_rgb_or_rgba_u16_row`] with +/// `ALPHA = true, ALPHA_SRC = true`. +/// +/// # Safety +/// +/// Same as [`yuv_420p16_to_rgba_u16_row`] plus `a_src.len() >= width`. 
+#[inline] +#[target_feature(enable = "sse4.1")] +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgba_u16_with_alpha_src_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a_src: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, +) { + // SAFETY: caller obligations forwarded to the shared impl. + unsafe { + yuv_420p16_to_rgb_or_rgba_u16_row::( + y, + u_half, + v_half, + Some(a_src), + rgba_out, + width, + matrix, + full_range, ); } } /// Shared SSE4.1 16-bit YUV 4:2:0 → native-depth `u16` kernel. -/// `ALPHA = false` writes RGB triples; `ALPHA = true` writes RGBA -/// quads with constant alpha `0xFFFF`. +/// - `ALPHA = false, ALPHA_SRC = false`: `write_rgb_u16_8`. +/// - `ALPHA = true, ALPHA_SRC = false`: `write_rgba_u16_8` with +/// constant alpha `0xFFFF`. +/// - `ALPHA = true, ALPHA_SRC = true`: `write_rgba_u16_8` with the +/// alpha lane loaded from `a_src` (full-range u16). /// /// # Safety /// @@ -3296,23 +3403,32 @@ pub(crate) unsafe fn yuv_420p16_to_rgba_u16_row( /// 3. `y.len() >= width`, `u_half.len() >= width / 2`, /// `v_half.len() >= width / 2`, /// `out.len() >= width * if ALPHA { 4 } else { 3 }`. +/// 4. When `ALPHA_SRC = true`: `a_src` must be `Some(_)` and +/// `a_src.unwrap().len() >= width`. #[inline] #[target_feature(enable = "sse4.1")] -pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( +#[allow(clippy::too_many_arguments)] +pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( y: &[u16], u_half: &[u16], v_half: &[u16], + a_src: Option<&[u16]>, out: &mut [u16], width: usize, matrix: ColorMatrix, full_range: bool, ) { + // Source alpha requires RGBA output. 
+ const { assert!(!ALPHA_SRC || ALPHA) }; let bpp: usize = if ALPHA { 4 } else { 3 }; debug_assert_eq!(width & 1, 0); debug_assert!(y.len() >= width); debug_assert!(u_half.len() >= width / 2); debug_assert!(v_half.len() >= width / 2); debug_assert!(out.len() >= width * bpp); + if ALPHA_SRC { + debug_assert!(a_src.as_ref().is_some_and(|s| s.len() >= width)); + } let coeffs = scalar::Coefficients::for_matrix(matrix); let (y_off, y_scale, c_scale) = scalar::range_params_n::<16, 16>(full_range); @@ -3430,13 +3546,16 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( ); if ALPHA { - write_rgba_u16_8( - r_lo_u16, - g_lo_u16, - b_lo_u16, - alpha_u16, - out.as_mut_ptr().add(x * 4), - ); + let a_v = if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies the + // wrapper passed Some(_), validated by debug_assert above. + // 16-bit alpha is full-range u16 — load 8 lanes (16 bytes) + // directly, no mask or shift. + _mm_loadu_si128(a_src.as_ref().unwrap_unchecked().as_ptr().add(x).cast()) + } else { + alpha_u16 + }; + write_rgba_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, a_v, out.as_mut_ptr().add(x * 4)); } else { write_rgb_u16_8(r_lo_u16, g_lo_u16, b_lo_u16, out.as_mut_ptr().add(x * 3)); } @@ -3449,7 +3568,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row( let tail_v = &v_half[x / 2..width / 2]; let tail_out = &mut out[x * bpp..width * bpp]; let tail_w = width - x; - if ALPHA { + if ALPHA_SRC { + // SAFETY (const-checked): ALPHA_SRC = true implies Some(_). 
+ let tail_a = &a_src.as_ref().unwrap_unchecked()[x..width]; + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + tail_y, tail_u, tail_v, tail_a, tail_out, tail_w, matrix, full_range, + ); + } else if ALPHA { scalar::yuv_420p16_to_rgba_u16_row( tail_y, tail_u, tail_v, tail_out, tail_w, matrix, full_range, ); diff --git a/src/row/arch/x86_sse41/tests.rs b/src/row/arch/x86_sse41/tests.rs index 1fe7dd2a..9afc1363 100644 --- a/src/row/arch/x86_sse41/tests.rs +++ b/src/row/arch/x86_sse41/tests.rs @@ -3108,3 +3108,148 @@ fn sse41_yuva444p_n_rgba_u16_matches_scalar_all_bits_widths() { ); } } + +// ---- YUVA 4:2:0 native-depth `u16` RGBA equivalence (Ship 8b‑2c) ---- + +fn check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = planar_n_plane::(width, 37); + let u = planar_n_plane::(width / 2, 53); + let v = planar_n_plane::(width / 2, 71); + let a_src = planar_n_plane::(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p_n_to_rgba_u16_with_alpha_src_row::( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva420p<{BITS}> → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +fn check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence( + width: usize, + matrix: ColorMatrix, + full_range: bool, + alpha_seed: usize, +) { + let y = p16_plane(width, 37); + let u = p16_plane(width / 2, 53); + let v = p16_plane(width / 2, 71); + let a_src = p16_plane(width, alpha_seed); + let mut rgba_scalar = std::vec![0u16; width * 4]; + let mut rgba_simd = std::vec![0u16; width * 4]; + 
scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_scalar, + width, + matrix, + full_range, + ); + unsafe { + yuv_420p16_to_rgba_u16_with_alpha_src_row( + &y, + &u, + &v, + &a_src, + &mut rgba_simd, + width, + matrix, + full_range, + ); + } + assert_eq!( + rgba_scalar, rgba_simd, + "SSE4.1 Yuva420p16 → RGBA u16 diverges (width={width}, matrix={matrix:?}, full_range={full_range}, alpha_seed={alpha_seed})" + ); +} + +#[test] +fn sse41_yuva420p_n_rgba_u16_matches_scalar_all_bits() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>(16, m, full, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva420p_n_rgba_u16_matches_scalar_widths() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + } +} + +#[test] +fn sse41_yuva420p16_rgba_u16_matches_scalar_all_matrices() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for m in [ + ColorMatrix::Bt601, + ColorMatrix::Bt709, + ColorMatrix::Bt2020Ncl, + ColorMatrix::Smpte240m, + ColorMatrix::Fcc, + ColorMatrix::YCgCo, + ] { + for full in [true, false] { + check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(16, m, full, 89); + } + } +} + +#[test] +fn sse41_yuva420p16_rgba_u16_matches_scalar_widths_and_alpha() { + if !std::arch::is_x86_feature_detected!("sse4.1") { + return; + } + for w in [16usize, 18, 30, 34, 1920, 1922] { + 
check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(w, ColorMatrix::Bt709, false, 89); + } + for seed in [13usize, 41, 127, 211] { + check_yuv420p16_u16_sse41_rgba_with_alpha_src_equivalence(16, ColorMatrix::Bt601, true, seed); + } +} diff --git a/src/row/mod.rs b/src/row/mod.rs index bea53e5e..83bf088b 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -5391,10 +5391,8 @@ pub fn yuva420p9_to_rgba_row( /// source's native bit depth) instead of being the opaque maximum /// `511`. /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p9_to_rgba_u16_row( @@ -5416,7 +5414,63 @@ pub fn yuva420p9_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); @@ -5517,10 +5571,8 @@ pub fn yuva420p10_to_rgba_row( /// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the /// per-pixel alpha element is **sourced from `a`** at native depth. /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p10_to_rgba_u16_row( @@ -5542,7 +5594,63 @@ pub fn yuva420p10_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); @@ -5642,10 +5750,8 @@ pub fn yuva420p16_to_rgba_row( /// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel /// alpha element is **sourced from `a`** at native depth (no shift). /// -/// # ⚠ Scalar-only as of Ship 8b‑2a -/// -/// This dispatcher routes to scalar regardless of `use_simd`. SIMD -/// wiring lands in Ship 8b‑2c. +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] pub fn yuva420p16_to_rgba_u16_row( @@ -5667,7 +5773,63 @@ pub fn yuva420p16_to_rgba_u16_row( assert!(a.len() >= width, "a row too short"); assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - let _ = use_simd; // SIMD per-arch routes land in Ship 8b-2c PR. + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( y, u_half, v_half, a, rgba_out, width, matrix, full_range, ); From df00980da9fb3374a34647c1e42a87eb4d534b04 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:05:04 +1200 Subject: [PATCH 2/6] refactor(row): split mod.rs into dispatch/* submodules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `src/row/mod.rs` had grown to 7276 lines, dominating the entire row crate-private surface. 
Split the public dispatchers into 7 sibling files under `src/row/dispatch/` grouped by source-format family for readability: - `dispatch/yuv420.rs` (~2700 lines): yuv_420 (8-bit) + yuv420p9/10/12/14/16 + p010/p012/p016 — RGB + RGBA - `dispatch/yuv444.rs` (~1330 lines): yuv_444 (8-bit) + yuv444p9/10/12/14/16 (BITS-generic helpers + per-bit-depth wrappers) — RGB + RGBA - `dispatch/nv.rs` (~630 lines): NV12 / NV21 / NV24 / NV42 — RGB + RGBA - `dispatch/pn.rs` (~800 lines): P410 / P412 / P416 (semi-planar 4:4:4) — RGB + RGBA - `dispatch/yuva.rs` (~845 lines): Yuva444p10 + the Yuva420p family (8-bit + 9 / 10 / 16-bit) — RGBA + u16 RGBA - `dispatch/rgb_ops.rs` (~170 lines): rgb_to_hsv_row, bgr_to_rgb_row, rgb_to_bgr_row - `dispatch/bayer.rs` (~160 lines): Bayer dispatchers `mod.rs` keeps: - Module-level doc + `pub(crate) mod arch / scalar` - `mod dispatch;` + `pub use dispatch::*::*` re-exports (the public API at `crate::row::*` is unchanged) - Shared dispatcher helpers (`rgb_row_bytes`, `rgba_row_bytes`, `rgb_row_elems`, `rgba_row_elems`, `uv_full_row_elems`, `assert_color_transform_well_formed`, `MAX_FUSED_TRANSFORM_ABS`) — bumped from `fn` (private) to `pub(crate)` so dispatch submodules can call them. - Runtime CPU feature detection (`neon_available`, `avx2_available`, `sse41_available`, `avx512_available`, `simd128_available`) — also bumped to `pub(crate)`. - Inline tests (`mod overflow_tests`, `mod bayer_dispatcher_tests`). mod.rs reduces from 7276 lines to 770 lines. The dispatcher function bodies were extracted byte-for-byte via `sed -n` — no semantic changes. The only edits were swapping `fn` → `pub(crate) fn` on shared helpers, adding per-file `use crate::row::*` imports for `scalar`, `arch`, helpers, and the CPU-detection helpers, plus the `pub use dispatch::*::*` re-exports in `mod.rs`. 
Verified across aarch64-apple-darwin, x86_64-unknown-freebsd, and wasm32-unknown-unknown: - `cargo check --lib --tests`: clean - `RUSTFLAGS=-Dwarnings cargo clippy --lib --tests`: clean - `cargo test --lib` (host): 629 passed (same as before) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/bayer.rs | 162 + src/row/dispatch/mod.rs | 17 + src/row/dispatch/nv.rs | 629 ++++ src/row/dispatch/pn.rs | 796 +++++ src/row/dispatch/rgb_ops.rs | 171 + src/row/dispatch/yuv420.rs | 2698 ++++++++++++++ src/row/dispatch/yuv444.rs | 1333 +++++++ src/row/dispatch/yuva.rs | 845 +++++ src/row/mod.rs | 6608 +---------------------------------- 9 files changed, 6702 insertions(+), 6557 deletions(-) create mode 100644 src/row/dispatch/bayer.rs create mode 100644 src/row/dispatch/mod.rs create mode 100644 src/row/dispatch/nv.rs create mode 100644 src/row/dispatch/pn.rs create mode 100644 src/row/dispatch/rgb_ops.rs create mode 100644 src/row/dispatch/yuv420.rs create mode 100644 src/row/dispatch/yuv444.rs create mode 100644 src/row/dispatch/yuva.rs diff --git a/src/row/dispatch/bayer.rs b/src/row/dispatch/bayer.rs new file mode 100644 index 00000000..4f45857f --- /dev/null +++ b/src/row/dispatch/bayer.rs @@ -0,0 +1,162 @@ +//! Bayer dispatchers (`bayer_to_rgb_row`, `bayer16_to_rgb_row`, +//! `bayer16_to_rgb_u16_row`) extracted from `row::mod` for organization. +//! +//! `use_simd` is currently a no-op for all Bayer paths — they route to +//! scalar regardless. Per-arch SIMD backends ship in a follow-up; the +//! parameter is wired through so callers don't have to touch their +//! call sites when SIMD lands. + +use crate::row::scalar; +use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems}; + +/// Converts one row of an 8-bit Bayer plane to packed RGB. +/// +/// Dispatches to the best available backend for the current target. 
+/// See [`scalar::bayer_to_rgb_row`] for the full semantic specification +/// (bilinear demosaic geometry, edge handling, output layout). +/// +/// `above` / `mid` / `below` are row-aligned slices into the source +/// Bayer plane via the **mirror-by-2** boundary contract: at the +/// top edge the caller supplies `above = mid_row(1)`, at the bottom +/// edge `below = mid_row(h - 2)`; replicate fallback only when +/// `height < 2`. See [`crate::raw::BayerRow::above`] for the full +/// rationale (CFA-parity preservation across boundaries). +/// `above` / `mid` / `below` must all be the same length — that +/// length is the row's pixel width. +/// +/// `m` is the precomputed `CCM · diag(wb)` 3×3 transform. Every +/// element must be finite (not NaN, not ±∞); the dispatcher +/// asserts this at the boundary so future unsafe SIMD kernels can +/// trust the contract. +/// +/// `rgb_out` must have at least `3 * mid.len()` bytes. +/// +/// **`use_simd` is currently a no-op.** All Bayer paths run the +/// scalar reference today; per-arch SIMD backends (NEON / SSE4.1 / +/// AVX2 / AVX-512 / wasm simd128) ship in a follow-up. The +/// parameter is wired through `MixedSinker` and the public +/// dispatchers now so callers don't have to touch their call sites +/// when SIMD lands. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer_to_rgb_row( + above: &[u8], + mid: &[u8], + below: &[u8], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u8], + _use_simd: bool, +) { + // Release-mode preflight: future unsafe SIMD backends will rely on + // these invariants for bounds-free pointer arithmetic, so we + // validate here rather than only via `debug_assert!` inside the + // scalar kernel. Same pattern as `yuv_420_to_rgb_row`. 
+ let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_bytes(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer_to_rgb_row(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); +} + +/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer +/// plane to packed `u8` RGB. +/// +/// `BITS` ∈ {10, 12, 14, 16}; samples are low-packed `u16` (active +/// values in the low `BITS` bits, range `[0, (1 << BITS) - 1]`). +/// Direct row-API callers are responsible for upholding the +/// low-packed contract; samples whose value exceeds +/// `(1 << BITS) - 1` produce defined-but-saturated output (no +/// panic, no UB). The walker +/// [`crate::raw::bayer16_to`] never sees out-of-range input +/// because [`crate::frame::BayerFrame16::try_new`] validates every +/// active sample at frame-construction time. +/// +/// `m` is the unscaled `CCM · diag(wb)` — the kernel bakes the +/// input→u8 rescale (`255 / ((1 << BITS) - 1)`) at output time. +/// `above` / `mid` / `below` must all be the same length; +/// `rgb_out` must have at least `3 * mid.len()` bytes. +/// +/// **`use_simd` is currently a no-op** (see +/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer16_to_rgb_row<const BITS: usize>( + above: &[u16], + mid: &[u16], + below: &[u16], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u8], + _use_simd: bool, +) { + const { + assert!( + BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, + "bayer16_to_rgb_row: BITS must be 10, 12, 14, or 16" + ) + }; + let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_bytes(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer16_to_rgb_row::<BITS>(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); +} + +/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer +/// plane to packed `u16` RGB (also low-packed at `BITS`). +/// +/// `BITS` ∈ {10, 12, 14, 16}. Input and output share the same +/// low-packed range `[0, (1 << BITS) - 1]` per channel — no +/// rescale, just clamp. `above` / `mid` / `below` must all be the +/// same length; `rgb_out` must have at least `3 * mid.len()` `u16` +/// elements. +/// +/// Direct row-API callers are responsible for upholding the +/// low-packed contract — see [`bayer16_to_rgb_row`] for the +/// full rationale on the safe path +/// ([`crate::frame::BayerFrame16::try_new`] + [`crate::raw::bayer16_to`]) +/// vs. the direct row API. +/// +/// **`use_simd` is currently a no-op** (see +/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn bayer16_to_rgb_u16_row<const BITS: usize>( + above: &[u16], + mid: &[u16], + below: &[u16], + row_parity: u32, + pattern: crate::raw::BayerPattern, + demosaic: crate::raw::BayerDemosaic, + m: &[[f32; 3]; 3], + rgb_out: &mut [u16], + _use_simd: bool, +) { + const { + assert!( + BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, + "bayer16_to_rgb_u16_row: BITS must be 10, 12, 14, or 16" + ) + }; + let width = mid.len(); + assert_eq!(above.len(), width, "above row length must match mid"); + assert_eq!(below.len(), width, "below row length must match mid"); + let rgb_min = rgb_row_elems(width); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + assert_color_transform_well_formed(m); + + scalar::bayer16_to_rgb_u16_row::<BITS>( + above, mid, below, row_parity, pattern, demosaic, m, rgb_out, + ); +} diff --git a/src/row/dispatch/mod.rs b/src/row/dispatch/mod.rs new file mode 100644 index 00000000..864123bd --- /dev/null +++ b/src/row/dispatch/mod.rs @@ -0,0 +1,17 @@ +//! Public row-dispatcher submodules. The dispatchers were extracted +//! from `row::mod` here so the parent module stays focused on +//! shared helpers, runtime CPU feature detection, and crate-private +//! `arch` / `scalar` glue. +//! +//! Submodules are gated `pub(super) mod` and re-exported via +//! `pub use` in `row::mod`, so the public API still appears at +//! `crate::row::*` (e.g. `crate::row::yuv_420_to_rgb_row`). Callers +//! see no API change from the split. + +pub(super) mod bayer; +pub(super) mod nv; +pub(super) mod pn; +pub(super) mod rgb_ops; +pub(super) mod yuv420; +pub(super) mod yuv444; +pub(super) mod yuva; diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs new file mode 100644 index 00000000..b342e6e4 --- /dev/null +++ b/src/row/dispatch/nv.rs @@ -0,0 +1,629 @@ +//! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and +//! RGBA outputs) extracted from `row::mod` for organization. 
+ +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// difference is UV source — NV12 delivers U and V interleaved in a +/// single `width`‑byte row (`U0, V0, U1, V1, …`). See +/// `scalar::nv12_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgb_row( + y: &[u8], + uv_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). + assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (checked above). + unsafe { + arch::neon::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. 
+ unsafe { + arch::x86_avx512::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` verified simd128 is on at + // compile time (WASM has no runtime CPU detection). + unsafe { + arch::wasm_simd128::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed RGB. +/// +/// Same numerical contract as [`nv12_to_rgb_row`]; the only +/// difference is chroma byte order — NV21 stores `V0, U0, V1, U1, …` +/// instead of NV12's `U0, V0, U1, V1, …`. See `scalar::nv21_to_rgb_row` +/// for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgb_row( + y: &[u8], + vu_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** +/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv12_to_rgba_row( + y: &[u8], + uv_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgba_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. 
+ assert_eq!(width & 1, 0, "NV12 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to +/// packed **RGBA** (8-bit). Same numerical contract as +/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv21_to_rgba_row( + y: &[u8], + vu_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "NV21 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(vu_half.len() >= width, "vu_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed +/// RGB. Dispatches to the best available SIMD backend for the current +/// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling +/// back to scalar when no backend is available. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// from NV12 is 4:4:4 chroma — one UV pair per Y pixel, no chroma +/// upsampling, and no width parity constraint. See +/// `scalar::nv24_to_rgb_row` for the reference implementation. 
+/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks can flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgb_row( + y: &[u8], + uv: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + // NV24 chroma carries one UV pair per pixel = `2 * width` bytes. + // Use `checked_mul` — on 32-bit targets, `2 * width` can overflow + // `usize` at extreme widths and silently short-circuit the length + // check before entering unsafe SIMD paths. + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU‑ordered) to packed +/// RGB. Same as [`nv24_to_rgb_row`] but with swapped chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgb_row( + y: &[u8], + vu: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); +} + +/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed +/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; +/// alpha defaults to `0xFF` (opaque). +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv24_to_rgba_row( + y: &[u8], + uv: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(uv.len() >= uv_min, "uv row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); +} + +/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed +/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped +/// chroma byte order. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn nv42_to_rgba_row( + y: &[u8], + vu: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let vu_min = match width.checked_mul(2) { + Some(n) => n, + None => panic!("width ({width}) × 2 overflows usize"), + }; + assert!(y.len() >= width, "y row too short"); + assert!(vu.len() >= vu_min, "vu row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs new file mode 100644 index 00000000..f2a143c4 --- /dev/null +++ b/src/row/dispatch/pn.rs @@ -0,0 +1,796 @@ +//! Semi-planar 4:4:4 (P410 / P412 / P416) dispatchers — RGB + RGBA +//! for both 8-bit and native-depth `u16` outputs. Extracted from +//! `row::mod` for organization. +//! +//! Internal `pub(crate)` helpers `p_n_444_to_rgb_row` / +//! `p_n_444_to_rgb_u16_row` provide the BITS-generic dispatch shared +//! by P410/P412 (`BITS = 10/12`); P416 has its own dedicated kernels +//! (full u16 range; the BITS-generic path doesn't apply). +//! +//! P010 / P012 / P016 (semi-planar 4:2:0) live in `dispatch::yuv420` +//! since they share the 4:2:0 chroma layout with the planar +//! yuv420p9/10/12/14/16 family. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, uv_full_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +// ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- +// +// Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width +// interleaved UV (one `U, V` pair per pixel = `2 * width` u16 elements +// per row). BITS ∈ {10, 12} run on the const-generic Q15 i32 family; +// BITS = 16 runs on the dedicated parallel i64-chroma family +// (chroma multiply-add overflows i32 at 16-bit u16 output). + +/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB +/// dispatcher. 
Const-generic over `BITS`; dispatches to the best +/// available backend (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), +/// falling back to scalar when no SIMD backend is available or +/// `use_simd` is false. +/// +/// Crate-private — public consumers go through the per-format +/// dispatchers (`p410_to_rgb_row`, `p412_to_rgb_row`) which fix +/// `BITS` to a literal. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn p_n_444_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile-time verified. 
+ unsafe { + arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** +/// RGB dispatcher. Output is low-bit-packed (active bits in low +/// `BITS` of each `u16`). Same dispatch shape as +/// [`p_n_444_to_rgb_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn p_n_444_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. +/// Y stays on i32 (output-range scaling keeps `coeff × u_d` within +/// i32 for u8 output); chroma multiply-add also stays on i32. +/// Dedicated entry point because the Q15 const-generic family is +/// pinned to BITS ∈ {10, 12}. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). Chroma +/// multiply-add runs on i64 (overflow safety at 16-bit u16 output); +/// see scalar reference for the rationale. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); +} + +/// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P410 → native-depth u16 RGB (10-bit low-packed output). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_u16_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P412 → packed u8 RGB. Thin wrapper at `BITS = 12`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P412 → native-depth u16 RGB (12-bit low-packed output). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgb_u16_row( + y: &[u16], + uv_full: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); +} + +/// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P410 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 1023]`); alpha element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p410_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P412 → **native-depth `u16`** packed **RGBA** — output is +/// low-bit-packed (`[0, 4095]`); alpha element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p412_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** +/// (`R, G, B, 0xFF`). Routes through the dedicated 16-bit scalar +/// kernel (`scalar::p_n_444_16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); +} + +/// P416 → **native-depth `u16`** packed **RGBA** — full-range output +/// `[0, 65535]`; alpha element is `0xFFFF`. Routes through the +/// dedicated 16-bit u16-output scalar kernel +/// (`scalar::p_n_444_16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p416_to_rgba_u16_row( + y: &[u16], + uv_full: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + let uv_min = uv_full_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_full.len() >= uv_min, "uv_full row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); +} + diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs new file mode 100644 index 00000000..c51257d8 --- /dev/null +++ b/src/row/dispatch/rgb_ops.rs @@ -0,0 +1,171 @@ +//! RGB→HSV and BGR↔RGB swap dispatchers extracted from `row::mod` for +//! organization. All three route through the standard +//! `cfg_select!` per-arch block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; + +/// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit +/// encoding). See `scalar::rgb_to_hsv_row` for semantics. +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend (same semantics as `yuv_420_to_rgb_row`). +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb_to_hsv_row( + rgb: &[u8], + h_out: &mut [u8], + s_out: &mut [u8], + v_out: &mut [u8], + width: usize, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). 
+ let rgb_min = rgb_row_bytes(width); + assert!(rgb.len() >= rgb_min, "rgb row too short"); + assert!(h_out.len() >= width, "h_out row too short"); + assert!(s_out.len() >= width, "s_out row too short"); + assert!(v_out.len() >= width, "v_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); + } + return; + } + }, + _ => { + // Targets without a SIMD HSV backend fall through to scalar. + } + } + } + + scalar::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); +} + +/// Rewrites a row of packed BGR to packed RGB by swapping the outer +/// two channels (byte 0 ↔ byte 2) of every triple. `input` and +/// `output` must not alias. +/// +/// The underlying transformation is self‑inverse, so +/// [`rgb_to_bgr_row`] shares the same implementation — use whichever +/// name reads more naturally at the call site. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +pub fn bgr_to_rgb_row(bgr: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { + swap_rb_channels_row(bgr, rgb_out, width, use_simd); +} + +/// Rewrites a row of packed RGB to packed BGR by swapping the outer +/// two channels. See [`bgr_to_rgb_row`] — this is an alias that reads +/// more naturally for the opposite direction. +#[cfg_attr(not(tarpaulin), inline(always))] +pub fn rgb_to_bgr_row(rgb: &[u8], bgr_out: &mut [u8], width: usize, use_simd: bool) { + swap_rb_channels_row(rgb, bgr_out, width, use_simd); +} + +/// Shared dispatcher behind `bgr_to_rgb_row` / `rgb_to_bgr_row`. +#[cfg_attr(not(tarpaulin), inline(always))] +fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: bool) { + // Runtime asserts at the dispatcher boundary (see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 3` multiplication). + let rgb_min = rgb_row_bytes(width); + assert!(input.len() >= rgb_min, "input row too short"); + assert!(output.len() >= rgb_min, "output row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::bgr_rgb_swap_row(input, output, width); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 just verified. + unsafe { + arch::x86_avx2::bgr_rgb_swap_row(input, output, width); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 just verified. + unsafe { + arch::x86_sse41::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::bgr_rgb_swap_row(input, output, width); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::bgr_rgb_swap_row(input, output, width); +} diff --git a/src/row/dispatch/yuv420.rs b/src/row/dispatch/yuv420.rs new file mode 100644 index 00000000..8f34dca1 --- /dev/null +++ b/src/row/dispatch/yuv420.rs @@ -0,0 +1,2698 @@ +//! YUV 4:2:0 dispatchers (planar and P010/P012/P016 semi-planar) — +//! 8-bit YUV → RGB/RGBA, 9/10/12/14/16-bit planar yuv420p_n RGB+RGBA, +//! P010/P012/P016 semi-planar RGB+RGBA. Extracted from `row::mod` for +//! organization. +//! +//! All dispatchers route through the standard `cfg_select!` per-arch +//! block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of 4:2:0 YUV to packed RGB. +/// +/// Dispatches to the best available backend for the current target. +/// See `scalar::yuv_420_to_rgb_row` for the full semantic +/// specification (range handling, matrix definitions, output layout). +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgb_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. 
The unsafe SIMD + // kernels below rely on these invariants for bounds‑free pointer + // arithmetic, so we validate in *release* builds too — not just + // under `debug_assert!`. Kernels keep their own `debug_assert!`s as + // internal sanity checks. + // + // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` + // on 32‑bit targets (wasm32, i686) for extreme widths. Without the + // guard, a wrapped product could admit an undersized `rgb_out` and + // let the scalar loop's `x * 3` indexing or a SIMD kernel's + // pointer arithmetic run off the end. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + // Bounds / parity invariants are the caller's obligation. + unsafe { + arch::x86_avx512::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. 
+ unsafe { + arch::x86_avx2::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + // Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference). + unsafe { + arch::x86_sse41::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below + // SSE4.1) slot in here, each branch guarded by the matching + // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` (compile‑time + // `cfg!(target_feature = "simd128")`) verified that simd128 + // is on. WASM has no runtime detection — the module's SIMD + // support is fixed at produce‑time. Bounds / parity + // invariants are the caller's obligation. + unsafe { + arch::wasm_simd128::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend (riscv64, powerpc, …) fall + // through to the scalar path below. + } + } + } + + scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. 
+ unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 9 active bits in the low bits of each +/// element. Niche format (AVC High 9 profile only). Reuses the same +/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the +/// only per-call difference is the const-generic `BITS = 9` which +/// fixes the AND-mask to `0x1FF` and the Q15 scale via +/// `range_params_n::<9, 8>`. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
+/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ +/// packed — 12 active bits in the high 12 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and +/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but +/// `>> 4` instead of `>> 6` at each `u16` load. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. +/// +/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs +/// on the **i64 chroma** kernel family; see +/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** +/// packed `u16` RGB (full-range output in `[0, 65535]`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to +/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed +/// vs. low-bit-packed distinction (all bits are active). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed RGB +/// (full-range output in `[0, 65535]`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p9_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p10_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 10) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p12_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p14_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 12) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// Routes through the dedicated 16-bit scalar kernel +/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element +/// is `0xFFFF` (opaque maximum at 16-bit). +/// +/// Routes through the dedicated 16-bit u16-output scalar kernel +/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit +/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. +/// +/// Routes through the dedicated 16-bit P016 scalar kernel +/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar +/// reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Routes through the dedicated 16-bit u16-output P016 scalar kernel +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444.rs b/src/row/dispatch/yuv444.rs new file mode 100644 index 00000000..5bc3a960 --- /dev/null +++ b/src/row/dispatch/yuv444.rs @@ -0,0 +1,1333 @@ +//! YUV 4:4:4 dispatchers (planar 8-bit + high-bit 9/10/12/14/16-bit) +//! — RGB + RGBA. Extracted from `row::mod` for organization. +//! +//! Internal `pub(crate)` helpers `yuv_444p_n_to_rgb_row` / +//! `yuv_444p_n_to_rgb_u16_row` provide the BITS-generic dispatch +//! 
shared by 9/10/12/14-bit; 16-bit gets its own dedicated kernels. +//! +//! All dispatchers route through the standard `cfg_select!` per-arch +//! block; `use_simd = false` forces scalar. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches +/// to the best available SIMD backend for the current target. +/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma +/// planes, no chroma upsampling, no width parity constraint. See +/// `scalar::yuv_444_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgb_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. + unsafe { + arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    unsafe {
+                        arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    unsafe {
+                        arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    unsafe {
+                        arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    unsafe {
+                        arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    unsafe {
+                        arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9/10/12/14-bit → **u8** RGB dispatcher. Const
+/// generic over `BITS ∈ {9, 10, 12, 14}`. Dispatches to the best
+/// available backend for the current target (NEON / SSE4.1 / AVX2 /
+/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
+/// backend is available or `use_simd` is false.
+///
+/// Crate-private — external callers use the concrete
+/// [`yuv444p9_to_rgb_row`] / [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] /
+/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a
+/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1`
+/// truncates to `-1` when cast to `i16` in the SIMD clamp), and
+/// matches the [`yuv420p10_to_rgb_row`] family's convention of
+/// keeping the `<BITS>` generic internal.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn yuv_444p_n_to_rgb_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    let rgb_min = rgb_row_bytes(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(u.len() >= width, "u row too short");
+    assert!(v.len() >= width, "v row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9/10/12/14-bit → **native-depth u16** RGB dispatcher.
+/// Const generic over `BITS ∈ {9, 10, 12, 14}`. Low-bit-packed output.
+/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 /
+/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
+/// backend is available or `use_simd` is false.
+///
+/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The
+/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a
+/// dedicated i64-chroma kernel family.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub(crate) fn yuv_444p_n_to_rgb_u16_row<const BITS: usize>(
+    y: &[u16],
+    u: &[u16],
+    v: &[u16],
+    rgb_out: &mut [u16],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    let rgb_min = rgb_row_elems(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(u.len() >= width, "u row too short");
+    assert!(v.len() >= width, "v row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select! {
+            target_arch = "aarch64" => {
+                if neon_available() {
+                    // SAFETY: NEON verified.
+                    unsafe {
+                        arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "x86_64" => {
+                if avx512_available() {
+                    // SAFETY: AVX‑512BW verified.
+                    unsafe {
+                        arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if avx2_available() {
+                    // SAFETY: AVX2 verified.
+                    unsafe {
+                        arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+                if sse41_available() {
+                    // SAFETY: SSE4.1 verified.
+                    unsafe {
+                        arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            target_arch = "wasm32" => {
+                if simd128_available() {
+                    // SAFETY: simd128 compile‑time verified.
+                    unsafe {
+                        arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+                    }
+                    return;
+                }
+            },
+            _ => {}
+        }
+    }
+
+    scalar::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
+}
+
+/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the
+/// crate-internal `yuv_444p_n_to_rgb_row::<9>`.
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → u8 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the +/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline +/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range +/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to +/// i64 to avoid i32 overflow at 16-bit limited range. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); +} +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. 
+ +/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the +/// per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_444p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha +/// element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha +/// element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha +/// element is `16383`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit +/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar +/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs new file mode 100644 index 00000000..90399881 --- /dev/null +++ b/src/row/dispatch/yuva.rs @@ -0,0 +1,845 @@ +//! YUVA dispatchers — Yuva444p10 + the Yuva420p family +//! (Yuva420p / Yuva420p9 / Yuva420p10 / Yuva420p16) for both 8-bit +//! RGBA and native-depth `u16` RGBA outputs. Extracted from +//! `row::mod` for organization. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +// ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- +// +// Per-row dispatchers for the YUVA source family (currently Yuva444p10 +// only). Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and +// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through +// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, +// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` +// dispatchers' patterns. + +/// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv444p10_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) +/// instead of being constant `0xFF`. 
+/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10-bit** YUVA 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the +/// per-pixel alpha element is **sourced from `a`** (already at the +/// source's native bit depth) instead of being the opaque maximum +/// `1023`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u, v, a, rgba_out, width, matrix, full_range, + ); +} + +// ---- YUVA 4:2:0 RGBA dispatchers -------------------------------------- +// +// Per-row dispatchers for the YUVA 4:2:0 source family — Yuva420p +// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p16. The u8 RGBA +// dispatchers route through per-arch +// `yuv_420*_to_rgba*_with_alpha_src_row` SIMD wrappers (Ship 8b-2b), +// mirroring the non-alpha sibling dispatchers' `cfg_select!` blocks. +// The native-depth `u16` RGBA dispatchers below remain scalar pending +// Ship 8b-2c. + +/// Converts one row of 8‑bit YUVA 4:2:0 to packed **8‑bit** **RGBA**. +/// R / G / B are produced by the same Q15 i32 8‑bit kernel that backs +/// [`yuv_420_to_rgba_row`]; the per-pixel alpha byte is **sourced +/// from `a`** (one byte per pixel, full-width — alpha is at luma +/// resolution in 4:2:0, only chroma is subsampled). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv_420_to_rgba_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + a: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **9‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv420p9_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`) +/// instead of being constant `0xFF`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **9‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the +/// per-pixel alpha element is **sourced from `a`** (already at the +/// source's native bit depth) instead of being the opaque maximum +/// `511`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family +/// that backs [`yuv420p10_to_rgba_row`]; the per-pixel alpha byte is +/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) +/// instead of being constant `0xFF`. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **10‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the +/// per-pixel alpha element is **sourced from `a`** at native depth. +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **16‑bit** YUVA 4:2:0 to packed **8‑bit** +/// **RGBA**. R / G / B are produced by the same i32 kernel that backs +/// [`yuv420p16_to_rgba_row`]; the per-pixel alpha byte is **sourced +/// from `a`** (depth-converted via `a >> 8` to fit `u8`). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_row`]'s pattern. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} + +/// Converts one row of **16‑bit** YUVA 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel +/// alpha element is **sourced from `a`** at native depth (no shift). +/// +/// `use_simd = false` forces the scalar reference path; otherwise +/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuva420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + a: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(a.len() >= width, "a row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( + y, u_half, v_half, a, rgba_out, width, matrix, full_range, + ); +} diff --git a/src/row/mod.rs b/src/row/mod.rs index 83bf088b..97704767 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -30,6398 +30,45 @@ //! //! Dispatcher `cfg_select!` requires Rust 1.95+ (stable, in the core //! prelude — no import needed). The crate's MSRV matches. +//! +//! # Submodule layout +//! +//! Public dispatchers are split across `dispatch::*` submodules by +//! source format family for readability — `yuv420` / `yuv444` / `nv` / +//! `pn` / `yuva` / `rgb_ops` / `bayer`. They are re-exported as +//! `pub use dispatch::*::*` here so the public API stays at +//! `crate::row::*` (e.g. `crate::row::yuv_420_to_rgb_row`). Callers +//! see no API change from the split. -pub(crate) mod arch; -pub(crate) mod scalar; - -// Re-exported only when a caller is compiled. The `MixedSinker` Strategy A -// fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which -// is gated on `feature = "std"` / `feature = "alloc"` (needs `Vec`). Without -// either feature both this re-export and the underlying scalar function would -// be unused, which is a hard error under `cargo clippy -- -D warnings`. -#[cfg(any(feature = "std", feature = "alloc"))] -pub(crate) use scalar::expand_rgb_to_rgba_row; -#[cfg(any(feature = "std", feature = "alloc"))] -pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; - -use crate::ColorMatrix; - -/// Converts one row of 4:2:0 YUV to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See `scalar::yuv_420_to_rgb_row` for the full semantic -/// specification (range handling, matrix definitions, output layout). 
-/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgb_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. The unsafe SIMD - // kernels below rely on these invariants for bounds‑free pointer - // arithmetic, so we validate in *release* builds too — not just - // under `debug_assert!`. Kernels keep their own `debug_assert!`s as - // internal sanity checks. - // - // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` - // on 32‑bit targets (wasm32, i686) for extreme widths. Without the - // guard, a wrapped product could admit an undersized `rgb_out` and - // let the scalar loop's `x * 3` indexing or a SIMD kernel's - // pointer arithmetic run off the end. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. 
- // Bounds / parity invariants are the caller's obligation. - unsafe { - arch::x86_avx512::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::x86_avx2::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - // Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference). - unsafe { - arch::x86_sse41::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below - // SSE4.1) slot in here, each branch guarded by the matching - // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` (compile‑time - // `cfg!(target_feature = "simd128")`) verified that simd128 - // is on. WASM has no runtime detection — the module's SIMD - // support is fixed at produce‑time. Bounds / parity - // invariants are the caller's obligation. - unsafe { - arch::wasm_simd128::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend (riscv64, powerpc, …) fall - // through to the scalar path below. - } - } - } - - scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). The first three bytes per pixel are -/// byte-identical to what [`yuv_420_to_rgb_row`] would write. -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. 
- unsafe { - arch::x86_avx2::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time availability verified. - unsafe { - arch::wasm_simd128::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// difference is UV source — NV12 delivers U and V interleaved in a -/// single `width`‑byte row (`U0, V0, U1, V1, …`). See -/// `scalar::nv12_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv12_to_rgb_row( - y: &[u8], - uv_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - assert_eq!(width & 1, 0, "NV12 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. 
Bounds / parity invariants are the caller's obligation - // (checked above). - unsafe { - arch::neon::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. - unsafe { - arch::x86_avx2::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` verified simd128 is on at - // compile time (WASM has no runtime CPU detection). - unsafe { - arch::wasm_simd128::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv12_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to -/// packed RGB. -/// -/// Same numerical contract as [`nv12_to_rgb_row`]; the only -/// difference is chroma byte order — NV21 stores `V0, U0, V1, U1, …` -/// instead of NV12's `U0, V0, U1, V1, …`. See `scalar::nv21_to_rgb_row` -/// for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv21_to_rgb_row( - y: &[u8], - vu_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. 
- assert_eq!(width & 1, 0, "NV21 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(vu_half.len() >= width, "vu_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv21_to_rgb_row(y, vu_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV12 (semi‑planar 4:2:0) to packed **RGBA** -/// (8-bit). Same numerical contract as [`nv12_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv12_to_rgba_row( - y: &[u8], - uv_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgba_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "NV12 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv12_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV21 (semi‑planar 4:2:0, VU-ordered) to -/// packed **RGBA** (8-bit). 
Same numerical contract as -/// [`nv21_to_rgb_row`]; alpha defaults to `0xFF` (opaque). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv21_to_rgba_row( - y: &[u8], - vu_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "NV21 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(vu_half.len() >= width, "vu_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv21_to_rgba_row(y, vu_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV24 (semi‑planar 4:4:4, UV‑ordered) to packed -/// RGB. Dispatches to the best available SIMD backend for the current -/// target (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), falling -/// back to scalar when no backend is available. 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// from NV12 is 4:4:4 chroma — one UV pair per Y pixel, no chroma -/// upsampling, and no width parity constraint. See -/// `scalar::nv24_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks can flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv24_to_rgb_row( - y: &[u8], - uv: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - // NV24 chroma carries one UV pair per pixel = `2 * width` bytes. - // Use `checked_mul` — on 32-bit targets, `2 * width` can overflow - // `usize` at extreme widths and silently short-circuit the length - // check before entering unsafe SIMD paths. - let uv_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(uv.len() >= uv_min, "uv row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv24_to_rgb_row(y, uv, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV42 (semi‑planar 4:4:4, VU‑ordered) to packed -/// RGB. Same as [`nv24_to_rgb_row`] but with swapped chroma byte order. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv42_to_rgb_row( - y: &[u8], - vu: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let vu_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(vu.len() >= vu_min, "vu row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::nv42_to_rgb_row(y, vu, rgb_out, width, matrix, full_range); -} - -/// Converts one row of NV24 (semi‑planar 4:4:4, UV-ordered) to packed -/// **RGBA** (8-bit). Same numerical contract as [`nv24_to_rgb_row`]; -/// alpha defaults to `0xFF` (opaque). -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv24_to_rgba_row( - y: &[u8], - uv: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(uv.len() >= uv_min, "uv row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv24_to_rgba_row(y, uv, rgba_out, width, matrix, full_range); -} - -/// Converts one row of NV42 (semi‑planar 4:4:4, VU-ordered) to packed -/// **RGBA** (8-bit). Same as [`nv24_to_rgba_row`] but with swapped -/// chroma byte order. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn nv42_to_rgba_row( - y: &[u8], - vu: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let vu_min = match width.checked_mul(2) { - Some(n) => n, - None => panic!("width ({width}) × 2 overflows usize"), - }; - assert!(y.len() >= width, "y row too short"); - assert!(vu.len() >= vu_min, "vu row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::nv42_to_rgba_row(y, vu, rgba_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches -/// to the best available SIMD backend for the current target. -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma -/// planes, no chroma upsampling, no width parity constraint. See -/// `scalar::yuv_444_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgb_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. - unsafe { - arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). -/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. -/// `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgba_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const -/// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best -/// available backend for the current target (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. -/// -/// Crate-private — external callers use the concrete -/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / -/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a -/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` -/// truncates to `-1` when cast to `i16` in the SIMD clamp), and -/// matches the [`yuv420p10_to_rgb_row`] family's convention of -/// keeping the `` generic internal. 
-#[cfg_attr(not(tarpaulin), inline(always))]
-#[allow(clippy::too_many_arguments)]
-pub(crate) fn yuv_444p_n_to_rgb_row<const BITS: usize>(
-    y: &[u16],
-    u: &[u16],
-    v: &[u16],
-    rgb_out: &mut [u8],
-    width: usize,
-    matrix: ColorMatrix,
-    full_range: bool,
-    use_simd: bool,
-) {
-    let rgb_min = rgb_row_bytes(width);
-    assert!(y.len() >= width, "y row too short");
-    assert!(u.len() >= width, "u row too short");
-    assert!(v.len() >= width, "v row too short");
-    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
-
-    if use_simd {
-        cfg_select! {
-            target_arch = "aarch64" => {
-                if neon_available() {
-                    // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "x86_64" => {
-                if avx512_available() {
-                    // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if avx2_available() {
-                    // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if sse41_available() {
-                    // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "wasm32" => {
-                if simd128_available() {
-                    // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            _ => {}
-        }
-    }
-
-    scalar::yuv_444p_n_to_rgb_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-}
-
-/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher.
-/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output.
-/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 /
-/// AVX-512 / wasm simd128), falling back to scalar when no SIMD
-/// backend is available or `use_simd` is false.
-///
-/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The
-/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a
-/// dedicated i64-chroma kernel family.
-#[cfg_attr(not(tarpaulin), inline(always))]
-#[allow(clippy::too_many_arguments)]
-pub(crate) fn yuv_444p_n_to_rgb_u16_row<const BITS: usize>(
-    y: &[u16],
-    u: &[u16],
-    v: &[u16],
-    rgb_out: &mut [u16],
-    width: usize,
-    matrix: ColorMatrix,
-    full_range: bool,
-    use_simd: bool,
-) {
-    let rgb_min = rgb_row_elems(width);
-    assert!(y.len() >= width, "y row too short");
-    assert!(u.len() >= width, "u row too short");
-    assert!(v.len() >= width, "v row too short");
-    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
-
-    if use_simd {
-        cfg_select! {
-            target_arch = "aarch64" => {
-                if neon_available() {
-                    // SAFETY: NEON verified.
-                    unsafe {
-                        arch::neon::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "x86_64" => {
-                if avx512_available() {
-                    // SAFETY: AVX‑512BW verified.
-                    unsafe {
-                        arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if avx2_available() {
-                    // SAFETY: AVX2 verified.
-                    unsafe {
-                        arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-                if sse41_available() {
-                    // SAFETY: SSE4.1 verified.
-                    unsafe {
-                        arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            target_arch = "wasm32" => {
-                if simd128_available() {
-                    // SAFETY: simd128 compile‑time verified.
-                    unsafe {
-                        arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-                    }
-                    return;
-                }
-            },
-            _ => {}
-        }
-    }
-
-    scalar::yuv_444p_n_to_rgb_u16_row::<BITS>(y, u, v, rgb_out, width, matrix, full_range);
-}
-
-/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the
-/// crate-internal `yuv_444p_n_to_rgb_row::<9>`.
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → u8 RGB. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → u8 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the -/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline -/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range -/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to -/// i64 to avoid i32 overflow at 16-bit limited range. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 9 active bits in the low bits of each -/// element. Niche format (AVC High 9 profile only). 
Reuses the same -/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the -/// only per-call difference is the const-generic `BITS = 9` which -/// fixes the AND-mask to `0x1FF` and the Q15 scale via -/// `range_params_n::<9, 8>`. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 10 active bits in the low bits of each -/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), -/// with the conversion clamping to `[0, 255]` — the native‑depth -/// path is [`yuv420p10_to_rgb_u16_row`]. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed -/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, -/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless -/// downstream HDR processing when the consumer expects low‑bit‑packed -/// samples. -/// -/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` -/// elements, each in `[0, 1023]` with the upper 6 bits zero. -/// -/// This is **not** the FFmpeg `p010` layout — `p010` stores samples -/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers -/// feeding this output into a p010 consumer must shift left by 6 -/// before handing off. -/// -/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ -/// packed — 10 active bits in the high 10 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// This is the HDR hardware‑decode keystone format: VideoToolbox, -/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic -/// specification. `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** to **native‑depth `u16`** packed RGB -/// (10 active bits in the **low** 10 of each output `u16`, matching -/// `yuv420p10le` convention — **not** the P010 high‑bit packing). -/// Callers feeding this output into a P010 consumer must shift left -/// by 6. -/// -/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. -/// `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 12 active bits in the low 12 bits of each -/// element (low‑bit‑packed `yuv420p12le` convention). Output is packed -/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The -/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching -/// `yuv420p12le` convention — upper 4 bits zero). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (14‑bit values in the low 14 of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ -/// packed — 12 active bits in the high 12 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and -/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but -/// `>> 4` instead of `>> 6` at each `u16` load. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** to **native‑depth `u16`** packed RGB -/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed -/// `yuv420p12le` convention, **not** P012's high‑bit packing). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. -/// -/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs -/// on the **i64 chroma** kernel family; see -/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** -/// packed `u16` RGB (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to -/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed -/// vs. low-bit-packed distinction (all bits are active). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed RGB -/// (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p9_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p10_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 10) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p12_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p14_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 12) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// Routes through the dedicated 16-bit scalar kernel -/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element -/// is `0xFFFF` (opaque maximum at 16-bit). -/// -/// Routes through the dedicated 16-bit u16-output scalar kernel -/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit -/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. -/// -/// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar -/// reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed -/// **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. -/// -/// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -// ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- -// -// Same shape as the 4:2:0 / 4:2:2 P-family kernels but with full-width -// interleaved UV (one `U, V` pair per pixel = `2 * width` u16 elements -// per row). BITS ∈ {10, 12} run on the const-generic Q15 i32 family; -// BITS = 16 runs on the dedicated parallel i64-chroma family -// (chroma multiply-add overflows i32 at 16-bit u16 output). 
- -/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → packed **u8** RGB -/// dispatcher. Const-generic over `BITS`; dispatches to the best -/// available backend (NEON / SSE4.1 / AVX2 / AVX-512 / wasm simd128), -/// falling back to scalar when no SIMD backend is available or -/// `use_simd` is false. -/// -/// Crate-private — public consumers go through the per-format -/// dispatchers (`p410_to_rgb_row`, `p412_to_rgb_row`) which fix -/// `BITS` to a literal. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile-time verified. 
- unsafe { - arch::wasm_simd128::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgb_row::(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// Pn 4:4:4 high-bit-packed (BITS ∈ {10, 12}) → native-depth **u16** -/// RGB dispatcher. Output is low-bit-packed (active bits in low -/// `BITS` of each `u16`). Same dispatch shape as -/// [`p_n_444_to_rgb_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn p_n_444_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgb_u16_row::(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P416 (semi-planar 4:4:4, 16-bit) → packed **u8** RGB dispatcher. -/// Y stays on i32 (output-range scaling keeps `coeff × u_d` within -/// i32 for u8 output); chroma multiply-add also stays on i32. -/// Dedicated entry point because the Q15 const-generic family is -/// pinned to BITS ∈ {10, 12}. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgb_row(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P416 → native-depth **u16** RGB dispatcher (`[0, 65535]`). Chroma -/// multiply-add runs on i64 (overflow safety at 16-bit u16 output); -/// see scalar reference for the rationale. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgb_u16_row(y, uv_full, rgb_out, width, matrix, full_range); -} - -/// P410 → packed u8 RGB. Thin wrapper at `BITS = 10`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P410 → native-depth u16 RGB (10-bit low-packed output). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_u16_row::<10>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P412 → packed u8 RGB. Thin wrapper at `BITS = 12`. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgb_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -/// P412 → native-depth u16 RGB (12-bit low-packed output). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgb_u16_row( - y: &[u16], - uv_full: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - p_n_444_to_rgb_u16_row::<12>(y, uv_full, rgb_out, width, matrix, full_range, use_simd); -} - -// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the -/// per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_444p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha -/// element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha -/// element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha -/// element is `16383`. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit -/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar -/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); -} - -// ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- -// -// Per-row dispatchers for the YUVA source family (currently Yuva444p10 -// only). 
Both the u8 RGBA dispatcher (`yuva444p10_to_rgba_row`) and -// the u16 RGBA dispatcher (`yuva444p10_to_rgba_u16_row`) route through -// per-arch `yuv_444p_n_to_rgba*_with_alpha_src_row` SIMD wrappers, -// mirroring the `yuv444p10_to_rgba_row` / `yuv444p10_to_rgba_u16_row` -// dispatchers' patterns. - -/// Converts one row of **10-bit** YUVA 4:4:4 to packed **8-bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv444p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10-bit** YUVA 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `1023`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv444p10_to_rgba_u16_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u, v, a, rgba_out, width, matrix, full_range, - ); -} - -// ---- YUVA 4:2:0 RGBA dispatchers -------------------------------------- -// -// Per-row dispatchers for the YUVA 4:2:0 source family — Yuva420p -// (8-bit) plus Yuva420p9 / Yuva420p10 / Yuva420p16. The u8 RGBA -// dispatchers route through per-arch -// `yuv_420*_to_rgba*_with_alpha_src_row` SIMD wrappers (Ship 8b-2b), -// mirroring the non-alpha sibling dispatchers' `cfg_select!` blocks. -// The native-depth `u16` RGBA dispatchers below remain scalar pending -// Ship 8b-2c. - -/// Converts one row of 8‑bit YUVA 4:2:0 to packed **8‑bit** **RGBA**. 
-/// R / G / B are produced by the same Q15 i32 8‑bit kernel that backs -/// [`yuv_420_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (one byte per pixel, full-width — alpha is at luma -/// resolution in 4:2:0, only chroma is subsampled). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv_420_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - a: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **9‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p9_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 1` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p9_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **9‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 511]`); the -/// per-pixel alpha element is **sourced from `a`** (already at the -/// source's native bit depth) instead of being the opaque maximum -/// `511`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p9_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<9>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same Q15 i32 kernel family -/// that backs [`yuv420p10_to_rgba_row`]; the per-pixel alpha byte is -/// **sourced from `a`** (depth-converted via `a >> 2` to fit `u8`) -/// instead of being constant `0xFF`. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **10‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); the -/// per-pixel alpha element is **sourced from `a`** at native depth. -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p10_to_rgba_u16_row`]'s pattern. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_with_alpha_src_row::<10>( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **16‑bit** YUVA 4:2:0 to packed **8‑bit** -/// **RGBA**. R / G / B are produced by the same i32 kernel that backs -/// [`yuv420p16_to_rgba_row`]; the per-pixel alpha byte is **sourced -/// from `a`** (depth-converted via `a >> 8` to fit `u8`). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// Converts one row of **16‑bit** YUVA 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output in `[0, 65535]`; the per-pixel -/// alpha element is **sourced from `a`** at native depth (no shift). -/// -/// `use_simd = false` forces the scalar reference path; otherwise -/// per-arch dispatch matches [`yuv420p16_to_rgba_u16_row`]'s pattern. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuva420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - a: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(a.len() >= width, "a row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_with_alpha_src_row( - y, u_half, v_half, a, rgba_out, width, matrix, full_range, - ); -} - -/// P410 (semi-planar 4:4:4, 10-bit high-packed) → packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P410 → **native-depth `u16`** packed **RGBA** — output is -/// low-bit-packed (`[0, 1023]`); alpha element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p410_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_u16_row::<10>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P412 (semi-planar 4:4:4, 12-bit high-packed) → packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P412 → **native-depth `u16`** packed **RGBA** — output is -/// low-bit-packed (`[0, 4095]`); alpha element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p412_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_to_rgba_u16_row::<12>(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P416 (semi-planar 4:4:4, 16-bit) → packed **8-bit** **RGBA** -/// (`R, G, B, 0xFF`). Routes through the dedicated 16-bit scalar -/// kernel (`scalar::p_n_444_16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgba_row(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// P416 → **native-depth `u16`** packed **RGBA** — full-range output -/// `[0, 65535]`; alpha element is `0xFFFF`. Routes through the -/// dedicated 16-bit u16-output scalar kernel -/// (`scalar::p_n_444_16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p416_to_rgba_u16_row( - y: &[u16], - uv_full: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - let uv_min = uv_full_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_full.len() >= uv_min, "uv_full row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); -} - -/// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit -/// encoding). See `scalar::rgb_to_hsv_row` for semantics. -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend (same semantics as `yuv_420_to_rgb_row`). -#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb_to_hsv_row( - rgb: &[u8], - h_out: &mut [u8], - s_out: &mut [u8], - v_out: &mut [u8], - width: usize, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - let rgb_min = rgb_row_bytes(width); - assert!(rgb.len() >= rgb_min, "rgb row too short"); - assert!(h_out.len() >= width, "h_out row too short"); - assert!(s_out.len() >= width, "s_out row too short"); - assert!(v_out.len() >= width, "v_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); - } - return; - } - }, - _ => { - // Targets without a SIMD HSV backend fall through to scalar. - } - } - } - - scalar::rgb_to_hsv_row(rgb, h_out, s_out, v_out, width); -} - -/// Rewrites a row of packed BGR to packed RGB by swapping the outer -/// two channels (byte 0 ↔ byte 2) of every triple. `input` and -/// `output` must not alias. -/// -/// The underlying transformation is self‑inverse, so -/// [`rgb_to_bgr_row`] shares the same implementation — use whichever -/// name reads more naturally at the call site. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -pub fn bgr_to_rgb_row(bgr: &[u8], rgb_out: &mut [u8], width: usize, use_simd: bool) { - swap_rb_channels_row(bgr, rgb_out, width, use_simd); -} - -/// Rewrites a row of packed RGB to packed BGR by swapping the outer -/// two channels. See [`bgr_to_rgb_row`] — this is an alias that reads -/// more naturally for the opposite direction. 
-#[cfg_attr(not(tarpaulin), inline(always))] -pub fn rgb_to_bgr_row(rgb: &[u8], bgr_out: &mut [u8], width: usize, use_simd: bool) { - swap_rb_channels_row(rgb, bgr_out, width, use_simd); -} - -/// Shared dispatcher behind `bgr_to_rgb_row` / `rgb_to_bgr_row`. -#[cfg_attr(not(tarpaulin), inline(always))] -fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: bool) { - // Runtime asserts at the dispatcher boundary (see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 3` multiplication). - let rgb_min = rgb_row_bytes(width); - assert!(input.len() >= rgb_min, "input row too short"); - assert!(output.len() >= rgb_min, "output row too short"); +pub(crate) mod arch; +pub(crate) mod scalar; +mod dispatch; - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::bgr_rgb_swap_row(input, output, width); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 just verified. - unsafe { - arch::x86_avx2::bgr_rgb_swap_row(input, output, width); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 just verified. - unsafe { - arch::x86_sse41::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::bgr_rgb_swap_row(input, output, width); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } +// Re-exported only when a caller is compiled. 
The `MixedSinker` Strategy A +// fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which +// is gated on `feature = "std"` / `feature = "alloc"` (needs `Vec`). Without +// either feature both this re-export and the underlying scalar function would +// be unused, which is a hard error under `cargo clippy -- -D warnings`. +#[cfg(any(feature = "std", feature = "alloc"))] +pub(crate) use scalar::expand_rgb_to_rgba_row; +#[cfg(any(feature = "std", feature = "alloc"))] +pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; - scalar::bgr_rgb_swap_row(input, output, width); -} +pub use dispatch::bayer::*; +pub use dispatch::nv::*; +pub use dispatch::pn::*; +pub use dispatch::rgb_ops::*; +pub use dispatch::yuv420::*; +pub use dispatch::yuv444::*; +pub use dispatch::yuva::*; + +// `yuv_444p_n_to_rgb_u16_row` is consumed by the 32-bit overflow test +// `yuv_444p_n_u16_dispatcher_rejects_width_times_3_overflow` below — +// the dispatch submodule keeps it as `pub(crate)`, so glob `pub use` +// doesn't pick it up. Gated on the same cfg the test uses to avoid +// `unused_imports` on builds that don't compile the test. +#[cfg(all(test, feature = "std", target_pointer_width = "32"))] +pub(crate) use dispatch::yuv444::yuv_444p_n_to_rgb_u16_row; // ---- shared dispatcher helpers --------------------------------------- @@ -6433,7 +80,7 @@ fn swap_rb_channels_row(input: &[u8], output: &mut [u8], width: usize, use_simd: /// multiplication here could admit an undersized buffer and trigger /// out‑of‑bounds writes downstream. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgb_row_bytes(width: usize) -> usize { +pub(crate) fn rgb_row_bytes(width: usize) -> usize { match width.checked_mul(3) { Some(n) => n, None => panic!("width ({width}) × 3 overflows usize"), @@ -6444,7 +91,7 @@ fn rgb_row_bytes(width: usize) -> usize { /// checking. Same purpose as [`rgb_row_bytes`] for the 4-channel /// path used by the RGBA dispatchers. 
#[cfg_attr(not(tarpaulin), inline(always))] -fn rgba_row_bytes(width: usize) -> usize { +pub(crate) fn rgba_row_bytes(width: usize) -> usize { match width.checked_mul(4) { Some(n) => n, None => panic!("width ({width}) × 4 overflows usize"), @@ -6459,7 +106,7 @@ fn rgba_row_bytes(width: usize) -> usize { /// caller allocates, and downstream SIMD kernels index with it /// directly without re‑multiplying. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgb_row_elems(width: usize) -> usize { +pub(crate) fn rgb_row_elems(width: usize) -> usize { match width.checked_mul(3) { Some(n) => n, None => panic!("width ({width}) × 3 overflows usize"), @@ -6471,7 +118,7 @@ fn rgb_row_elems(width: usize) -> usize { /// elements, not bytes. Callers use it to size `&mut [u16]` buffers /// for the high-bit-depth `u16` RGBA output path. #[cfg_attr(not(tarpaulin), inline(always))] -fn rgba_row_elems(width: usize) -> usize { +pub(crate) fn rgba_row_elems(width: usize) -> usize { match width.checked_mul(4) { Some(n) => n, None => panic!("width ({width}) × 4 overflows usize"), @@ -6514,7 +161,7 @@ pub(crate) const MAX_FUSED_TRANSFORM_ABS: f32 = 1.0e12; /// row-API callers and the dispatcher-level guarantee that /// matches what validated upstream inputs can produce. #[cfg_attr(not(tarpaulin), inline(always))] -fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { +pub(crate) fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { let mut row = 0; while row < 3 { let mut col = 0; @@ -6544,7 +191,7 @@ fn assert_color_transform_well_formed(m: &[[f32; 3]; 3]) { /// `assert!`, so an unchecked multiplication on 32-bit targets could /// silently admit an undersized buffer. 
#[cfg_attr(not(tarpaulin), inline(always))] -fn uv_full_row_elems(width: usize) -> usize { +pub(crate) fn uv_full_row_elems(width: usize) -> usize { match width.checked_mul(2) { Some(n) => n, None => panic!("width ({width}) × 2 overflows usize (UV row)"), @@ -6572,7 +219,7 @@ fn uv_full_row_elems(width: usize) -> usize { /// NEON availability on aarch64. #[cfg(all(target_arch = "aarch64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn neon_available() -> bool { +pub(crate) fn neon_available() -> bool { if cfg!(colconv_force_scalar) { return false; } @@ -6582,14 +229,14 @@ fn neon_available() -> bool { /// NEON availability on aarch64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "aarch64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn neon_available() -> bool { +pub(crate) const fn neon_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "neon") } /// AVX2 availability on x86_64. #[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn avx2_available() -> bool { +pub(crate) fn avx2_available() -> bool { if cfg!(colconv_force_scalar) || cfg!(colconv_disable_avx2) { return false; } @@ -6599,14 +246,14 @@ fn avx2_available() -> bool { /// AVX2 availability on x86_64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn avx2_available() -> bool { +pub(crate) const fn avx2_available() -> bool { !cfg!(colconv_force_scalar) && !cfg!(colconv_disable_avx2) && cfg!(target_feature = "avx2") } /// SSE4.1 availability on x86_64. 
#[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn sse41_available() -> bool { +pub(crate) fn sse41_available() -> bool { if cfg!(colconv_force_scalar) { return false; } @@ -6616,14 +263,14 @@ fn sse41_available() -> bool { /// SSE4.1 availability on x86_64 — no‑std variant (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn sse41_available() -> bool { +pub(crate) const fn sse41_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "sse4.1") } /// AVX‑512 (F + BW) availability on x86_64. #[cfg(all(target_arch = "x86_64", feature = "std"))] #[cfg_attr(not(tarpaulin), inline(always))] -fn avx512_available() -> bool { +pub(crate) fn avx512_available() -> bool { if cfg!(colconv_force_scalar) || cfg!(colconv_disable_avx512) { return false; } @@ -6634,7 +281,7 @@ fn avx512_available() -> bool { /// (compile‑time). #[cfg(all(target_arch = "x86_64", not(feature = "std")))] #[cfg_attr(not(tarpaulin), inline(always))] -const fn avx512_available() -> bool { +pub(crate) const fn avx512_available() -> bool { !cfg!(colconv_force_scalar) && !cfg!(colconv_disable_avx512) && cfg!(target_feature = "avx512bw") } @@ -6643,162 +290,9 @@ const fn avx512_available() -> bool { /// a compile‑time check regardless of the `std` feature. #[cfg(target_arch = "wasm32")] #[cfg_attr(not(tarpaulin), inline(always))] -const fn simd128_available() -> bool { +pub(crate) const fn simd128_available() -> bool { !cfg!(colconv_force_scalar) && cfg!(target_feature = "simd128") } - -/// Converts one row of an 8-bit Bayer plane to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See [`scalar::bayer_to_rgb_row`] for the full semantic specification -/// (bilinear demosaic geometry, edge handling, output layout). 
-/// -/// `above` / `mid` / `below` are row-aligned slices into the source -/// Bayer plane via the **mirror-by-2** boundary contract: at the -/// top edge the caller supplies `above = mid_row(1)`, at the bottom -/// edge `below = mid_row(h - 2)`; replicate fallback only when -/// `height < 2`. See [`crate::raw::BayerRow::above`] for the full -/// rationale (CFA-parity preservation across boundaries). -/// `above` / `mid` / `below` must all be the same length — that -/// length is the row's pixel width. -/// -/// `m` is the precomputed `CCM · diag(wb)` 3×3 transform. Every -/// element must be finite (not NaN, not ±∞); the dispatcher -/// asserts this at the boundary so future unsafe SIMD kernels can -/// trust the contract. -/// -/// `rgb_out` must have at least `3 * mid.len()` bytes. -/// -/// **`use_simd` is currently a no-op.** All Bayer paths run the -/// scalar reference today; per-arch SIMD backends (NEON / SSE4.1 / -/// AVX2 / AVX-512 / wasm simd128) ship in a follow-up. The -/// parameter is wired through `MixedSinker` and the public -/// dispatchers now so callers don't have to touch their call sites -/// when SIMD lands. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer_to_rgb_row( - above: &[u8], - mid: &[u8], - below: &[u8], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u8], - _use_simd: bool, -) { - // Release-mode preflight: future unsafe SIMD backends will rely on - // these invariants for bounds-free pointer arithmetic, so we - // validate here rather than only via `debug_assert!` inside the - // scalar kernel. Same pattern as `yuv_420_to_rgb_row`. 
- let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_bytes(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer_to_rgb_row(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); -} - -/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer -/// plane to packed `u8` RGB. -/// -/// `BITS` ∈ {10, 12, 14, 16}; samples are low-packed `u16` (active -/// values in the low `BITS` bits, range `[0, (1 << BITS) - 1]`). -/// Direct row-API callers are responsible for upholding the -/// low-packed contract; samples whose value exceeds -/// `(1 << BITS) - 1` produce defined-but-saturated output (no -/// panic, no UB). The walker -/// [`crate::raw::bayer16_to`] never sees out-of-range input -/// because [`crate::frame::BayerFrame16::try_new`] validates every -/// active sample at frame-construction time. -/// -/// `m` is the unscaled `CCM · diag(wb)` — the kernel bakes the -/// input→u8 rescale (`255 / ((1 << BITS) - 1)`) at output time. -/// `above` / `mid` / `below` must all be the same length; -/// `rgb_out` must have at least `3 * mid.len()` bytes. -/// -/// **`use_simd` is currently a no-op** (see -/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer16_to_rgb_row( - above: &[u16], - mid: &[u16], - below: &[u16], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u8], - _use_simd: bool, -) { - const { - assert!( - BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, - "bayer16_to_rgb_row: BITS must be 10, 12, 14, or 16" - ) - }; - let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_bytes(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer16_to_rgb_row::(above, mid, below, row_parity, pattern, demosaic, m, rgb_out); -} - -/// Converts one row of a 10/12/14/16-bit **low-packed** Bayer -/// plane to packed `u16` RGB (also low-packed at `BITS`). -/// -/// `BITS` ∈ {10, 12, 14, 16}. Input and output share the same -/// low-packed range `[0, (1 << BITS) - 1]` per channel — no -/// rescale, just clamp. `above` / `mid` / `below` must all be the -/// same length; `rgb_out` must have at least `3 * mid.len()` `u16` -/// elements. -/// -/// Direct row-API callers are responsible for upholding the -/// low-packed contract — see [`bayer16_to_rgb_row`] for the -/// full rationale on the safe path -/// ([`crate::frame::BayerFrame16::try_new`] + [`crate::raw::bayer16_to`]) -/// vs. the direct row API. -/// -/// **`use_simd` is currently a no-op** (see -/// [`bayer_to_rgb_row`] for the deferred-SIMD note). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn bayer16_to_rgb_u16_row( - above: &[u16], - mid: &[u16], - below: &[u16], - row_parity: u32, - pattern: crate::raw::BayerPattern, - demosaic: crate::raw::BayerDemosaic, - m: &[[f32; 3]; 3], - rgb_out: &mut [u16], - _use_simd: bool, -) { - const { - assert!( - BITS == 10 || BITS == 12 || BITS == 14 || BITS == 16, - "bayer16_to_rgb_u16_row: BITS must be 10, 12, 14, or 16" - ) - }; - let width = mid.len(); - assert_eq!(above.len(), width, "above row length must match mid"); - assert_eq!(below.len(), width, "below row length must match mid"); - let rgb_min = rgb_row_elems(width); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - assert_color_transform_well_formed(m); - - scalar::bayer16_to_rgb_u16_row::( - above, mid, below, row_parity, pattern, demosaic, m, rgb_out, - ); -} - #[cfg(all(test, feature = "std"))] mod overflow_tests { //! 32-bit RGB-row-bytes overflow regressions for the public From fd9edbad4b783e06b24a7b4e33df8e977a2c8562 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:41:52 +1200 Subject: [PATCH 3/6] refactor(row): split dispatch/yuv420.rs and yuv444.rs into per-format directories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `src/row/dispatch/yuv420.rs` (2698 lines) and `yuv444.rs` (1333 lines) were the two largest files left after the previous split. 
Split each into a subdirectory with one file per source format: ``` src/row/dispatch/yuv420/ mod.rs (re-exports + module decls, 31 lines) yuv_420.rs (8-bit YUV 4:2:0 RGB / RGBA, 222 lines) yuv420p9.rs (4 variants, 360 lines) yuv420p10.rs (4 variants, 367 lines) yuv420p12.rs (4 variants, 343 lines) yuv420p14.rs (4 variants, 332 lines) yuv420p16.rs (4 variants, 291 lines) p010.rs (P010 4:2:0 semi-planar, 312 lines) p012.rs (P012, 296 lines) p016.rs (P016, 279 lines) src/row/dispatch/yuv444/ mod.rs (re-exports + pub(crate) BITS-generic helpers `yuv_444p_n_to_rgb_row` / `yuv_444p_n_to_rgb_u16_row` shared by 9/10/12/14 wrappers, 197 lines) yuv_444.rs (8-bit YUV 4:4:4 RGB / RGBA, 159 lines) yuv444p9.rs (thin RGB wrappers + full RGBA dispatchers, 209 lines) yuv444p10.rs (193 lines) yuv444p12.rs (192 lines) yuv444p14.rs (192 lines) yuv444p16.rs (full dispatchers — BITS-generic template pinned to {9,10,12,14}, so 16-bit gets its own, 304 lines) ``` No semantic changes — function bodies were extracted byte-for-byte via `sed -n` from the prior single-file modules. The only edits were: - Per-file `use` lines trimmed to what each file actually needs (e.g. 8-bit dispatchers don't import `rgb_row_elems` / `rgba_row_elems`; the BITS-generic helper file in yuv444 doesn't need `rgba_row_*`). - `yuv444/p9.rs`-`p14.rs` add `use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row};` so the thin wrappers reach the helpers in the sibling `yuv444/mod.rs`. - Parent `dispatch/mod.rs` is unchanged — the existing `pub(super) mod yuv420; pub(super) mod yuv444;` declarations resolve to the new `yuv420/mod.rs` / `yuv444/mod.rs` files. The maximum file size in `src/row/dispatch/` is now 845 lines (`yuva.rs`); after dropping yuv420.rs/yuv444.rs the largest YUV files are 367 / 304 lines. 
Verified across aarch64-apple-darwin, x86_64-unknown-freebsd, and wasm32-unknown-unknown: - `cargo check --lib --tests`: clean - `RUSTFLAGS=-Dwarnings cargo clippy --lib --tests`: clean - `cargo test --lib` (host): 629 passed (same as before) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/yuv420.rs | 2698 -------------------------- src/row/dispatch/yuv420/mod.rs | 31 + src/row/dispatch/yuv420/p010.rs | 312 +++ src/row/dispatch/yuv420/p012.rs | 296 +++ src/row/dispatch/yuv420/p016.rs | 279 +++ src/row/dispatch/yuv420/yuv420p10.rs | 367 ++++ src/row/dispatch/yuv420/yuv420p12.rs | 343 ++++ src/row/dispatch/yuv420/yuv420p14.rs | 332 ++++ src/row/dispatch/yuv420/yuv420p16.rs | 291 +++ src/row/dispatch/yuv420/yuv420p9.rs | 360 ++++ src/row/dispatch/yuv420/yuv_420.rs | 222 +++ src/row/dispatch/yuv444.rs | 1333 ------------- src/row/dispatch/yuv444/mod.rs | 197 ++ src/row/dispatch/yuv444/yuv444p10.rs | 193 ++ src/row/dispatch/yuv444/yuv444p12.rs | 192 ++ src/row/dispatch/yuv444/yuv444p14.rs | 192 ++ src/row/dispatch/yuv444/yuv444p16.rs | 304 +++ src/row/dispatch/yuv444/yuv444p9.rs | 209 ++ src/row/dispatch/yuv444/yuv_444.rs | 159 ++ 19 files changed, 4279 insertions(+), 4031 deletions(-) delete mode 100644 src/row/dispatch/yuv420.rs create mode 100644 src/row/dispatch/yuv420/mod.rs create mode 100644 src/row/dispatch/yuv420/p010.rs create mode 100644 src/row/dispatch/yuv420/p012.rs create mode 100644 src/row/dispatch/yuv420/p016.rs create mode 100644 src/row/dispatch/yuv420/yuv420p10.rs create mode 100644 src/row/dispatch/yuv420/yuv420p12.rs create mode 100644 src/row/dispatch/yuv420/yuv420p14.rs create mode 100644 src/row/dispatch/yuv420/yuv420p16.rs create mode 100644 src/row/dispatch/yuv420/yuv420p9.rs create mode 100644 src/row/dispatch/yuv420/yuv_420.rs delete mode 100644 src/row/dispatch/yuv444.rs create mode 100644 src/row/dispatch/yuv444/mod.rs create mode 100644 src/row/dispatch/yuv444/yuv444p10.rs create mode 100644 
src/row/dispatch/yuv444/yuv444p12.rs create mode 100644 src/row/dispatch/yuv444/yuv444p14.rs create mode 100644 src/row/dispatch/yuv444/yuv444p16.rs create mode 100644 src/row/dispatch/yuv444/yuv444p9.rs create mode 100644 src/row/dispatch/yuv444/yuv_444.rs diff --git a/src/row/dispatch/yuv420.rs b/src/row/dispatch/yuv420.rs deleted file mode 100644 index 8f34dca1..00000000 --- a/src/row/dispatch/yuv420.rs +++ /dev/null @@ -1,2698 +0,0 @@ -//! YUV 4:2:0 dispatchers (planar and P010/P012/P016 semi-planar) — -//! 8-bit YUV → RGB/RGBA, 9/10/12/14/16-bit planar yuv420p_n RGB+RGBA, -//! P010/P012/P016 semi-planar RGB+RGBA. Extracted from `row::mod` for -//! organization. -//! -//! All dispatchers route through the standard `cfg_select!` per-arch -//! block; `use_simd = false` forces scalar. - -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; -#[cfg(target_arch = "aarch64")] -use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; -#[cfg(target_arch = "wasm32")] -use crate::row::simd128_available; -use crate::ColorMatrix; - -/// Converts one row of 4:2:0 YUV to packed RGB. -/// -/// Dispatches to the best available backend for the current target. -/// See `scalar::yuv_420_to_rgb_row` for the full semantic -/// specification (range handling, matrix definitions, output layout). -/// -/// `use_simd = false` forces the scalar reference path, bypassing any -/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD -/// directly on the same input; production code should pass `true`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgb_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary. 
The unsafe SIMD - // kernels below rely on these invariants for bounds‑free pointer - // arithmetic, so we validate in *release* builds too — not just - // under `debug_assert!`. Kernels keep their own `debug_assert!`s as - // internal sanity checks. - // - // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` - // on 32‑bit targets (wasm32, i686) for extreme widths. Without the - // guard, a wrapped product could admit an undersized `rgb_out` and - // let the scalar loop's `x * 3` indexing or a SIMD kernel's - // pointer arithmetic run off the end. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. - unsafe { - arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - // Bounds / parity invariants are the caller's obligation. - unsafe { - arch::x86_avx512::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present on this - // CPU. Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference); they are checked - // with `debug_assert` in debug builds. 
- unsafe { - arch::x86_avx2::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - // Bounds / parity invariants are the caller's obligation - // (same contract as the scalar reference). - unsafe { - arch::x86_sse41::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below - // SSE4.1) slot in here, each branch guarded by the matching - // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: `simd128_available()` (compile‑time - // `cfg!(target_feature = "simd128")`) verified that simd128 - // is on. WASM has no runtime detection — the module's SIMD - // support is fixed at produce‑time. Bounds / parity - // invariants are the caller's obligation. - unsafe { - arch::wasm_simd128::yuv_420_to_rgb_row( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend (riscv64, powerpc, …) fall - // through to the scalar path below. - } - } - } - - scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). -/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel — sources without an alpha plane -/// produce opaque output). The first three bytes per pixel are -/// byte-identical to what [`yuv_420_to_rgb_row`] would write. -/// -/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the -/// scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_420_to_rgba_row( - y: &[u8], - u_half: &[u8], - v_half: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - // Runtime asserts at the dispatcher boundary — see - // [`yuv_420_to_rgb_row`] for rationale, including the checked - // `width × 4` multiplication via [`rgba_row_bytes`]. - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: `avx512_available()` verified AVX‑512BW is present. - unsafe { - arch::x86_avx512::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: `avx2_available()` verified AVX2 is present. - unsafe { - arch::x86_avx2::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: `sse41_available()` verified SSE4.1 is present. - unsafe { - arch::x86_sse41::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time availability verified. 
- unsafe { - arch::wasm_simd128::yuv_420_to_rgba_row( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => { - // Targets without a SIMD backend fall through to scalar. - } - } - } - - scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 9 active bits in the low bits of each -/// element. Niche format (AVC High 9 profile only). Reuses the same -/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the -/// only per-call difference is the const-generic `BITS = 9` which -/// fixes the AND-mask to `0x1FF` and the Q15 scale via -/// `range_params_n::<9, 8>`. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 10 active bits in the low bits of each -/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), -/// with the conversion clamping to `[0, 255]` — the native‑depth -/// path is [`yuv420p10_to_rgb_u16_row`]. -/// -/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed -/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, -/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless -/// downstream HDR processing when the consumer expects low‑bit‑packed -/// samples. -/// -/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` -/// elements, each in `[0, 1023]` with the upper 6 bits zero. -/// -/// This is **not** the FFmpeg `p010` layout — `p010` stores samples -/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers -/// feeding this output into a p010 consumer must shift left by 6 -/// before handing off. -/// -/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic -/// specification. `use_simd = false` forces the scalar reference -/// path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ -/// packed — 10 active bits in the high 10 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// This is the HDR hardware‑decode keystone format: VideoToolbox, -/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit -/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic -/// specification. `use_simd = false` forces the scalar reference. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P010** to **native‑depth `u16`** packed RGB -/// (10 active bits in the **low** 10 of each output `u16`, matching -/// `yuv420p10le` convention — **not** the P010 high‑bit packing). -/// Callers feeding this output into a P010 consumer must shift left -/// by 6. -/// -/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
-/// `use_simd = false` forces the scalar reference. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P010 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -/// -/// Samples are `u16` with 12 active bits in the low 12 bits of each -/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed -/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The -/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching -/// `yuv420p12le` convention — upper 4 bits zero). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed -/// `u16` RGB (14‑bit values in the low 14 of each `u16`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( - y, u_half, v_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ -/// packed — 12 active bits in the high 12 of each `u16`) to packed -/// **8‑bit** RGB. -/// -/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and -/// VP9 Profile 3 hardware decoders. Same shift semantics as P010 but -/// `>> 4` instead of `>> 6` at each `u16` load. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P012** to **native‑depth `u16`** packed RGB -/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed -/// `yuv420p12le` convention, **not** P012's high‑bit packing). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P012 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( - y, uv_half, rgb_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. -/// -/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs -/// on the **i64 chroma** kernel family; see -/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** -/// packed `u16` RGB (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgb_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to -/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed -/// vs. low-bit-packed distinction (all bits are active). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed RGB -/// (full-range output in `[0, 65535]`). -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgb_u16_row( - y: &[u16], - uv_half: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "P016 requires even width"); - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); -} -// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces -// the scalar reference path on every dispatcher. - -/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p9_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified on this CPU; bounds / parity are - // the caller's obligation (asserted above). - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p9_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p10_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. 
- unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p10_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 10) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p010_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p12_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. 
See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p12_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. 
- unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv420p14_to_rgb_row`] except -/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_420p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. 
- unsafe { - arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p14_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( - y, u_half, v_half, rgba_out, width, matrix, full_range, - ); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to packed **8-bit** **RGBA**. 
Alpha defaults to -/// `0xFF` (opaque). -/// -/// See `scalar::p_n_to_rgba_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, -/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output -/// is low-bit-packed; alpha element is `(1 << 12) - 1`. -/// -/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p012_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// Routes through the dedicated 16-bit scalar kernel -/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient -/// for u8 output even at 16-bit input. `use_simd = false` forces the -/// scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element -/// is `0xFFFF` (opaque maximum at 16-bit). -/// -/// Routes through the dedicated 16-bit u16-output scalar kernel -/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply -/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv420p16_to_rgba_u16_row( - y: &[u16], - u_half: &[u16], - v_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u_half.len() >= width / 2, "u_half row too short"); - assert!(v_half.len() >= width / 2, "v_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit -/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. -/// -/// Routes through the dedicated 16-bit P016 scalar kernel -/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar -/// reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **P016** to **native-depth `u16`** packed -/// **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. -/// -/// Routes through the dedicated 16-bit u16-output P016 scalar kernel -/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. -/// `use_simd = false` forces the scalar reference path. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn p016_to_rgba_u16_row( - y: &[u16], - uv_half: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(uv_half.len() >= width, "uv_half row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); -} diff --git a/src/row/dispatch/yuv420/mod.rs b/src/row/dispatch/yuv420/mod.rs new file mode 100644 index 00000000..972210e7 --- /dev/null +++ b/src/row/dispatch/yuv420/mod.rs @@ -0,0 +1,31 @@ +//! YUV 4:2:0 dispatchers, split per source format for readability. +//! +//! - `yuv_420` — 8-bit YUV 4:2:0 → RGB / RGBA. +//! - `yuv420p9` / `yuv420p10` / `yuv420p12` / `yuv420p14` / +//! `yuv420p16` — high-bit planar 4:2:0 (4 variants per format: +//! RGB, RGB-u16, RGBA, RGBA-u16). +//! 
- `p010` / `p012` / `p016` — high-bit semi-planar 4:2:0 +//! (4 variants per format). +//! +//! Public functions re-exported up to `crate::row::*` via parent +//! `dispatch/mod.rs`. + +pub(super) mod p010; +pub(super) mod p012; +pub(super) mod p016; +pub(super) mod yuv420p10; +pub(super) mod yuv420p12; +pub(super) mod yuv420p14; +pub(super) mod yuv420p16; +pub(super) mod yuv420p9; +pub(super) mod yuv_420; + +pub use p010::*; +pub use p012::*; +pub use p016::*; +pub use yuv420p10::*; +pub use yuv420p12::*; +pub use yuv420p14::*; +pub use yuv420p16::*; +pub use yuv420p9::*; +pub use yuv_420::*; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs new file mode 100644 index 00000000..35f9e548 --- /dev/null +++ b/src/row/dispatch/yuv420/p010.rs @@ -0,0 +1,312 @@ +//! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 +//! variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ +/// packed — 10 active bits in the high 10 of each `u16`) to packed +/// **8‑bit** RGB. +/// +/// This is the HDR hardware‑decode keystone format: VideoToolbox, +/// VA‑API, NVDEC, D3D11VA, and Intel QSV all emit P010 for 10‑bit +/// output. See `scalar::p_n_to_rgb_row::<10>` for the full semantic +/// specification. `use_simd = false` forces the scalar reference. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P010** to **native‑depth `u16`** packed RGB +/// (10 active bits in the **low** 10 of each output `u16`, matching +/// `yuv420p10le` convention — **not** the P010 high‑bit packing). +/// Callers feeding this output into a P010 consumer must shift left +/// by 6. +/// +/// See `scalar::p_n_to_rgb_u16_row::<10>` for the full spec. 
+/// `use_simd = false` forces the scalar reference. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P010 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<10>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 10) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<10>` for the reference. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p010_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<10>(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs new file mode 100644 index 00000000..618bc8f6 --- /dev/null +++ b/src/row/dispatch/yuv420/p012.rs @@ -0,0 +1,296 @@ +//! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 +//! variants. 
 +
+use crate::row::scalar;
+use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems};
+#[cfg(target_arch = "aarch64")]
+use crate::row::neon_available;
+#[cfg(target_arch = "x86_64")]
+use crate::row::{avx2_available, avx512_available, sse41_available};
+#[cfg(target_arch = "wasm32")]
+use crate::row::simd128_available;
+use crate::ColorMatrix;
+
+
+/// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑
+/// packed — 12 active bits in the high 12 of each `u16`) to packed
+/// **8‑bit** RGB.
+///
+/// P012 is the 12‑bit sibling of P010, emitted by HEVC Main 12 and
+/// VP9 Profile 2 hardware decoders. Same shift semantics as P010 but
+/// `>> 4` instead of `>> 6` at each `u16` load.
+#[cfg_attr(not(tarpaulin), inline(always))]
+#[allow(clippy::too_many_arguments)]
+pub fn p012_to_rgb_row(
+    y: &[u16],
+    uv_half: &[u16],
+    rgb_out: &mut [u8],
+    width: usize,
+    matrix: ColorMatrix,
+    full_range: bool,
+    use_simd: bool,
+) {
+    assert_eq!(width & 1, 0, "P012 requires even width");
+    let rgb_min = rgb_row_bytes(width);
+    assert!(y.len() >= width, "y row too short");
+    assert!(uv_half.len() >= width, "uv_half row too short");
+    assert!(rgb_out.len() >= rgb_min, "rgb_out row too short");
+
+    if use_simd {
+        cfg_select!
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P012** to **native‑depth `u16`** packed RGB +/// (12 active bits in the low 12 of each output `u16` — low‑bit‑packed +/// `yuv420p12le` convention, **not** P012's high‑bit packing). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P012 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p_n_to_rgb_u16_row::<12>( + y, uv_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to +/// `0xFF` (opaque). +/// +/// See `scalar::p_n_to_rgba_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, +/// high-bit-packed) to **native-depth `u16`** packed **RGBA** — output +/// is low-bit-packed; alpha element is `(1 << 12) - 1`. +/// +/// See `scalar::p_n_to_rgba_u16_row::<12>` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p012_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p_n_to_rgba_u16_row::<12>(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs new file mode 100644 index 00000000..128aaf17 --- /dev/null +++ b/src/row/dispatch/yuv420/p016.rs @@ -0,0 +1,279 @@ +//! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to +/// packed **8-bit** RGB. At 16 bits there is no high-bit-packed +/// vs. low-bit-packed distinction (all bits are active). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_row(y, uv_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed RGB +/// (full-range output in `[0, 65535]`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgb_u16_row( + y: &[u16], + uv_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "P016 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit +/// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. +/// +/// Routes through the dedicated 16-bit P016 scalar kernel +/// (`scalar::p16_to_rgba_row`). `use_simd = false` forces the scalar +/// reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_row(y, uv_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **P016** to **native-depth `u16`** packed +/// **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. +/// +/// Routes through the dedicated 16-bit u16-output P016 scalar kernel +/// (`scalar::p16_to_rgba_u16_row`) — i64 chroma multiply. +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn p016_to_rgba_u16_row( + y: &[u16], + uv_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "semi-planar 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(uv_half.len() >= width, "uv_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::p16_to_rgba_u16_row(y, uv_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs new file mode 100644 index 00000000..27f7a1ff --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -0,0 +1,367 @@ +//! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+ +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 10 active bits in the low bits of each +/// element. Output is packed `R, G, B` bytes (`3 * width` bytes), +/// with the conversion clamping to `[0, 255]` — the native‑depth +/// path is [`yuv420p10_to_rgb_u16_row`]. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **10‑bit** YUV 4:2:0 to **native‑depth** packed +/// RGB `u16` (10‑bit values in the **low** 10 bits of each `u16`, +/// matching FFmpeg's `yuv420p10le` convention). Use this for lossless +/// downstream HDR processing when the consumer expects low‑bit‑packed +/// samples. +/// +/// Output is packed `R, G, B` triples: `rgb_out[3 * width]` `u16` +/// elements, each in `[0, 1023]` with the upper 6 bits zero. +/// +/// This is **not** the FFmpeg `p010` layout — `p010` stores samples +/// in the **high** 10 bits of each `u16` (`sample << 6`). Callers +/// feeding this output into a p010 consumer must shift left by 6 +/// before handing off. +/// +/// See `scalar::yuv_420p_n_to_rgb_u16_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<10>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p10_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 10) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 10) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p10_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<10>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<10>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs new file mode 100644 index 00000000..9d250c9a --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -0,0 +1,343 @@ +//! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 12 active bits in the low 12 bits of each +/// element (low‑bit‑packed `yuv420p12le` convention). 
Output is packed +/// `R, G, B` bytes (`3 * width` bytes), clamping to `[0, 255]`. The +/// native‑depth path is [`yuv420p12_to_rgb_u16_row`]. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **12‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (12‑bit values in the **low** 12 of each `u16`, matching +/// `yuv420p12le` convention — upper 4 bits zero). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<12>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p12_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 12) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 12) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p12_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<12>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<12>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs new file mode 100644 index 00000000..a1c8024f --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -0,0 +1,332 @@ +//! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **14‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (14‑bit values in the low 14 of each `u16`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<14>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). 
+/// +/// Same numerical contract as [`yuv420p14_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 14) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 14) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p14_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<14>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<14>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs new file mode 100644 index 00000000..7b324e7d --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -0,0 +1,291 @@ +//! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. +/// +/// Samples are `u16` over the full 16-bit range (`[0, 65535]`). Runs +/// on the **i64 chroma** kernel family; see +/// [`scalar::yuv_420p16_to_rgb_row`] for the numerical contract. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth** +/// packed `u16` RGB (full-range output in `[0, 65535]`). 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// Routes through the dedicated 16-bit scalar kernel +/// (`scalar::yuv_420p16_to_rgba_row`) — i32 chroma family is sufficient +/// for u8 output even at 16-bit input. `use_simd = false` forces the +/// scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element +/// is `0xFFFF` (opaque maximum at 16-bit). 
+/// +/// Routes through the dedicated 16-bit u16-output scalar kernel +/// (`scalar::yuv_420p16_to_rgba_u16_row`) — uses i64 chroma multiply +/// for the wider `coeff × u_d` product at 16 → 16-bit scaling. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p16_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p16_to_rgba_u16_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv420p9.rs 
b/src/row/dispatch/yuv420/yuv420p9.rs new file mode 100644 index 00000000..c28da34a --- /dev/null +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -0,0 +1,360 @@ +//! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, +//! RGBA, RGBA-u16). + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. +/// +/// Samples are `u16` with 9 active bits in the low bits of each +/// element. Niche format (AVC High 9 profile only). Reuses the same +/// `yuv_420p_n_to_rgb_row` kernel family as 10/12/14-bit; the +/// only per-call difference is the const-generic `BITS = 9` which +/// fixes the AND-mask to `0x1FF` and the Q15 scale via +/// `range_params_n::<9, 8>`. +/// +/// See `scalar::yuv_420p_n_to_rgb_row` for the full semantic +/// specification. `use_simd = false` forces the scalar reference +/// path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of **9‑bit** YUV 4:2:0 to **native‑depth** packed +/// `u16` RGB (9-bit values in the **low** 9 bits of each `u16`). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgb_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgb_u16_row::<9>( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgb_u16_row::<9>(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +// ---- High-bit 4:2:0 RGBA dispatchers (Ship 8 Tranche 5) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 5a + 5b). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:2:0 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv420p9_to_rgb_row`] except +/// for the per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_420p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified on this CPU; bounds / parity are + // the caller's obligation (asserted above). + unsafe { + arch::neon::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:2:0 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_420p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv420p9_to_rgba_u16_row( + y: &[u16], + u_half: &[u16], + v_half: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_420p_n_to_rgba_u16_row::<9>( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_420p_n_to_rgba_u16_row::<9>(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs new file mode 100644 index 00000000..6428c008 --- /dev/null +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -0,0 +1,222 @@ +//! 8-bit YUV 4:2:0 → RGB / RGBA dispatchers (`yuv_420_to_rgb_row`, +//! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` +//! module per source format for organization. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of 4:2:0 YUV to packed RGB. +/// +/// Dispatches to the best available backend for the current target. +/// See `scalar::yuv_420_to_rgb_row` for the full semantic +/// specification (range handling, matrix definitions, output layout). +/// +/// `use_simd = false` forces the scalar reference path, bypassing any +/// SIMD backend. Benchmarks flip this to compare scalar vs SIMD +/// directly on the same input; production code should pass `true`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgb_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary. The unsafe SIMD + // kernels below rely on these invariants for bounds‑free pointer + // arithmetic, so we validate in *release* builds too — not just + // under `debug_assert!`. Kernels keep their own `debug_assert!`s as + // internal sanity checks. + // + // `rgb_min` uses `checked_mul` because `3 * width` can wrap `usize` + // on 32‑bit targets (wasm32, i686) for extreme widths. Without the + // guard, a wrapped product could admit an undersized `rgb_out` and + // let the scalar loop's `x * 3` indexing or a SIMD kernel's + // pointer arithmetic run off the end. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::neon::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + // Bounds / parity invariants are the caller's obligation. 
+ unsafe { + arch::x86_avx512::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present on this + // CPU. Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference); they are checked + // with `debug_assert` in debug builds. + unsafe { + arch::x86_avx2::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + // Bounds / parity invariants are the caller's obligation + // (same contract as the scalar reference). + unsafe { + arch::x86_sse41::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + // Future x86_64 tiers (avx512 promoted above AVX2, ssse3 below + // SSE4.1) slot in here, each branch guarded by the matching + // `is_x86_feature_detected!` / `cfg!(target_feature = ...)` pair. + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: `simd128_available()` (compile‑time + // `cfg!(target_feature = "simd128")`) verified that simd128 + // is on. WASM has no runtime detection — the module's SIMD + // support is fixed at produce‑time. Bounds / parity + // invariants are the caller's obligation. + unsafe { + arch::wasm_simd128::yuv_420_to_rgb_row( + y, u_half, v_half, rgb_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend (riscv64, powerpc, …) fall + // through to the scalar path below. + } + } + } + + scalar::yuv_420_to_rgb_row(y, u_half, v_half, rgb_out, width, matrix, full_range); +} + +/// Converts one row of 4:2:0 YUV to packed **RGBA** (8-bit). 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel — sources without an alpha plane +/// produce opaque output). The first three bytes per pixel are +/// byte-identical to what [`yuv_420_to_rgb_row`] would write. +/// +/// `rgba_out.len() >= 4 * width`. `use_simd = false` forces the +/// scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_420_to_rgba_row( + y: &[u8], + u_half: &[u8], + v_half: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + // Runtime asserts at the dispatcher boundary — see + // [`yuv_420_to_rgb_row`] for rationale, including the checked + // `width × 4` multiplication via [`rgba_row_bytes`]. + assert_eq!(width & 1, 0, "YUV 4:2:0 requires even width"); + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u_half.len() >= width / 2, "u_half row too short"); + assert!(v_half.len() >= width / 2, "v_half row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: `avx512_available()` verified AVX‑512BW is present. + unsafe { + arch::x86_avx512::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if avx2_available() { + // SAFETY: `avx2_available()` verified AVX2 is present. 
+ unsafe { + arch::x86_avx2::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + if sse41_available() { + // SAFETY: `sse41_available()` verified SSE4.1 is present. + unsafe { + arch::x86_sse41::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time availability verified. + unsafe { + arch::wasm_simd128::yuv_420_to_rgba_row( + y, u_half, v_half, rgba_out, width, matrix, full_range, + ); + } + return; + } + }, + _ => { + // Targets without a SIMD backend fall through to scalar. + } + } + } + + scalar::yuv_420_to_rgba_row(y, u_half, v_half, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444.rs b/src/row/dispatch/yuv444.rs deleted file mode 100644 index 5bc3a960..00000000 --- a/src/row/dispatch/yuv444.rs +++ /dev/null @@ -1,1333 +0,0 @@ -//! YUV 4:4:4 dispatchers (planar 8-bit + high-bit 9/10/12/14/16-bit) -//! — RGB + RGBA. Extracted from `row::mod` for organization. -//! -//! Internal `pub(crate)` helpers `yuv_444p_n_to_rgb_row` / -//! `yuv_444p_n_to_rgb_u16_row` provide the BITS-generic dispatch -//! shared by 9/10/12/14-bit; 16-bit gets its own dedicated kernels. -//! -//! All dispatchers route through the standard `cfg_select!` per-arch -//! block; `use_simd = false` forces scalar. - -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; -#[cfg(target_arch = "aarch64")] -use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; -#[cfg(target_arch = "wasm32")] -use crate::row::simd128_available; -use crate::ColorMatrix; - -/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches -/// to the best available SIMD backend for the current target. 
-/// -/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference -/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma -/// planes, no chroma upsampling, no width parity constraint. See -/// `scalar::yuv_444_to_rgb_row` for the reference implementation. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgb_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: `neon_available()` verified NEON is present. - unsafe { - arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX-512BW verified. - unsafe { - arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 verified at compile time. 
- unsafe { - arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). -/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only -/// differences are the per-pixel stride (4 vs 3) and the alpha byte -/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. -/// `use_simd = false` forces scalar. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv_444_to_rgba_row( - y: &[u8], - u: &[u8], - v: &[u8], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! 
{ - target_arch = "aarch64" => { - if neon_available() { - unsafe { - arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - unsafe { - arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - unsafe { - arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - unsafe { - arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - unsafe { - arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const -/// generic over `BITS ∈ {10, 12, 14}`. Dispatches to the best -/// available backend for the current target (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. -/// -/// Crate-private — external callers use the concrete -/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / -/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a -/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` -/// truncates to `-1` when cast to `i16` in the SIMD clamp), and -/// matches the [`yuv420p10_to_rgb_row`] family's convention of -/// keeping the `` generic internal. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. -/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output. -/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 / -/// AVX-512 / wasm simd128), falling back to scalar when no SIMD -/// backend is available or `use_simd` is false. 
-/// -/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The -/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a -/// dedicated i64-chroma kernel family. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub(crate) fn yuv_444p_n_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<9>`. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the -/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → u8 RGB. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → u8 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the -/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline -/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); -} - -/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range -/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to -/// i64 to avoid i32 overflow at 16-bit limited range. 
-#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgb_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgb_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgb_min = rgb_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); -} -// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- -// -// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch -// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces -// the scalar reference path on every dispatcher. 
- -/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the -/// source has no alpha plane). -/// -/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the -/// per-pixel stride (4 vs 3) and the constant alpha byte. See -/// `scalar::yuv_444p_n_to_rgba_row` for the reference. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. - unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. 
- unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` -/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` -/// (opaque maximum at the input bit depth). -/// -/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p9_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha -/// element is `1023`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p10_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha -/// element is `4095`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p12_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha -/// element is `16383`. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p14_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** -/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit -/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u8], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_bytes(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); -} - -/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** -/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is -/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar -/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. -/// -/// `use_simd = false` forces the scalar reference path. -#[cfg_attr(not(tarpaulin), inline(always))] -#[allow(clippy::too_many_arguments)] -pub fn yuv444p16_to_rgba_u16_row( - y: &[u16], - u: &[u16], - v: &[u16], - rgba_out: &mut [u16], - width: usize, - matrix: ColorMatrix, - full_range: bool, - use_simd: bool, -) { - let rgba_min = rgba_row_elems(width); - assert!(y.len() >= width, "y row too short"); - assert!(u.len() >= width, "u row too short"); - assert!(v.len() >= width, "v row too short"); - assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); - - if use_simd { - cfg_select! { - target_arch = "aarch64" => { - if neon_available() { - // SAFETY: NEON verified. - unsafe { - arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "x86_64" => { - if avx512_available() { - // SAFETY: AVX‑512BW verified. - unsafe { - arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if avx2_available() { - // SAFETY: AVX2 verified. - unsafe { - arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - if sse41_available() { - // SAFETY: SSE4.1 verified. 
- unsafe { - arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - target_arch = "wasm32" => { - if simd128_available() { - // SAFETY: simd128 compile‑time verified. - unsafe { - arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); - } - return; - } - }, - _ => {} - } - } - - scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); -} diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs new file mode 100644 index 00000000..4db06906 --- /dev/null +++ b/src/row/dispatch/yuv444/mod.rs @@ -0,0 +1,197 @@ +//! YUV 4:4:4 dispatchers, split per source format for readability. +//! +//! - `yuv_444` — 8-bit YUV 4:4:4 → RGB / RGBA. +//! - `yuv444p9` / `yuv444p10` / `yuv444p12` / `yuv444p14` — +//! high-bit planar (4 variants per format). RGB / RGB-u16 paths +//! are thin wrappers over the BITS-generic helpers below; the +//! RGBA / RGBA-u16 paths are full dispatchers. +//! - `yuv444p16` — 16-bit planar with its own dedicated dispatchers +//! (the BITS-generic template is pinned to {9, 10, 12, 14}). +//! +//! `yuv_444p_n_to_rgb_row` / `yuv_444p_n_to_rgb_u16_row` +//! are the BITS-generic dispatchers shared by the 9 / 10 / 12 / 14-bit +//! RGB wrappers above. They stay `pub(crate)` and live here at the +//! `yuv444` module root so siblings can reach them via `super::*`. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const +/// generic over `BITS ∈ {10, 12, 14}`. 
Dispatches to the best +/// available backend for the current target (NEON / SSE4.1 / AVX2 / +/// AVX-512 / wasm simd128), falling back to scalar when no SIMD +/// backend is available or `use_simd` is false. +/// +/// Crate-private — external callers use the concrete +/// [`yuv444p10_to_rgb_row`] / [`yuv444p12_to_rgb_row`] / +/// [`yuv444p14_to_rgb_row`] wrappers, which pin `BITS` to a +/// supported value. This avoids the 16-bit footgun (`(1 << 16) - 1` +/// truncates to `-1` when cast to `i16` in the SIMD clamp), and +/// matches the [`yuv420p10_to_rgb_row`] family's convention of +/// keeping the `` generic internal. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn yuv_444p_n_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgb_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar 10/12/14-bit → **native-depth u16** RGB dispatcher. +/// Const generic over `BITS ∈ {10, 12, 14}`. Low-bit-packed output. +/// Dispatches to the best available backend (NEON / SSE4.1 / AVX2 / +/// AVX-512 / wasm simd128), falling back to scalar when no SIMD +/// backend is available or `use_simd` is false. +/// +/// Crate-private — see the note on [`yuv_444p_n_to_rgb_row`]. The +/// 16-bit path is [`yuv444p16_to_rgb_u16_row`], which uses a +/// dedicated i64-chroma kernel family. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub(crate) fn yuv_444p_n_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgb_u16_row::(y, u, v, rgb_out, width, matrix, full_range); +} + +pub(super) mod yuv444p10; +pub(super) mod yuv444p12; +pub(super) mod yuv444p14; +pub(super) mod yuv444p16; +pub(super) mod yuv444p9; +pub(super) mod yuv_444; + +pub use yuv444p10::*; +pub use yuv444p12::*; +pub use yuv444p14::*; +pub use yuv444p16::*; +pub use yuv444p9::*; +pub use yuv_444::*; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs new file mode 100644 index 00000000..770f286e --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -0,0 +1,193 @@ +//! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<10>`. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 10-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **10-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 1023]`); alpha +/// element is `1023`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p10_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. 
+ unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<10>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs new file mode 100644 index 00000000..15edca7c --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -0,0 +1,192 @@ +//! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 12-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 12-bit → native-depth u16 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **12-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 4095]`); alpha +/// element is `4095`. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p12_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. 
+ unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<12>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs new file mode 100644 index 00000000..50f39021 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -0,0 +1,192 @@ +//! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 14-bit → u8 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 14-bit → native-depth u16 RGB. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + + +/// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **14-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, 16383]`); alpha +/// element is `16383`. +/// +/// `use_simd = false` forces the scalar reference path. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p14_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<14>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs new file mode 100644 index 00000000..adfe2c35 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -0,0 +1,304 @@ +//! 16-bit planar YUV 4:4:4 dispatchers — 4 variants. The BITS-generic +//! 
helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets +//! its own dedicated dispatchers (i64 chroma at native u16 output). + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the +/// parallel 16-bit kernel family (same Q15 i32 output-range pipeline +/// as [`yuv_420p16_to_rgb_row`] but with 1:1 chroma per pixel). +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// YUV 4:4:4 planar **16-bit** → packed **u16** RGB (full-range +/// output in `[0, 65535]`). Widens chroma multiply-add + Y scale to +/// i64 to avoid i32 overflow at 16-bit limited range. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. Native 512-bit i64-chroma kernel. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); +} + + +/// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit +/// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **16-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — full-range output `[0, 65535]`; alpha element is +/// `0xFFFF`. Routes through the dedicated 16-bit u16-output scalar +/// kernel (`scalar::yuv_444p16_to_rgba_u16_row`) — i64 chroma multiply. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p16_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. 
+ unsafe { + arch::x86_sse41::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p16_to_rgba_u16_row(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs new file mode 100644 index 00000000..2cff1f05 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -0,0 +1,209 @@ +//! 9-bit planar YUV 4:4:4 dispatchers — 4 variants. The RGB / RGB-u16 +//! paths are thin wrappers over the BITS-generic helpers in +//! `super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}`; the +//! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic +//! template doesn't apply for the alpha-fill case). + +use crate::row::scalar; +use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + +use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; + + +/// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the +/// crate-internal `yuv_444p_n_to_rgb_row::<9>`. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +/// YUV 4:4:4 planar 9-bit → native-depth u16 RGB. 
+#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgb_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgb_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + yuv_444p_n_to_rgb_u16_row::<9>(y, u, v, rgb_out, width, matrix, full_range, use_simd); +} + +// ---- High-bit 4:4:4 RGBA dispatchers (Ship 8 Tranche 7) --------------- +// +// Both u8 and native-depth `u16` RGBA dispatchers route to per-arch +// SIMD kernels (Ship 8 Tranches 7b + 7c). `use_simd = false` forces +// the scalar reference path on every dispatcher. + +/// Converts one row of **9-bit** YUV 4:4:4 to packed **8-bit** +/// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the +/// source has no alpha plane). +/// +/// Same numerical contract as [`yuv444p9_to_rgb_row`] except for the +/// per-pixel stride (4 vs 3) and the constant alpha byte. See +/// `scalar::yuv_444p_n_to_rgba_row` for the reference. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. + unsafe { + arch::neon::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. 
+ unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} + +/// Converts one row of **9-bit** YUV 4:4:4 to **native-depth `u16`** +/// packed **RGBA** — output is low-bit-packed (`[0, (1 << 9) - 1]` +/// in the low bits of each `u16`); alpha element is `(1 << 9) - 1` +/// (opaque maximum at the input bit depth). +/// +/// See `scalar::yuv_444p_n_to_rgba_u16_row` for the reference. +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv444p9_to_rgba_u16_row( + y: &[u16], + u: &[u16], + v: &[u16], + rgba_out: &mut [u16], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_elems(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: NEON verified. 
+ unsafe { + arch::neon::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX‑512BW verified. + unsafe { + arch::x86_avx512::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 compile‑time verified. + unsafe { + arch::wasm_simd128::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444p_n_to_rgba_u16_row::<9>(y, u, v, rgba_out, width, matrix, full_range); +} diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs new file mode 100644 index 00000000..8f4352a2 --- /dev/null +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -0,0 +1,159 @@ +//! 8-bit YUV 4:4:4 → RGB / RGBA dispatchers (`yuv_444_to_rgb_row`, +//! `yuv_444_to_rgba_row`). Extracted from the parent +//! `dispatch::yuv444` module per source format for organization. + +use crate::row::scalar; +use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; +#[cfg(target_arch = "aarch64")] +use crate::row::neon_available; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +#[cfg(target_arch = "wasm32")] +use crate::row::simd128_available; +use crate::ColorMatrix; + + +/// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches +/// to the best available SIMD backend for the current target. 
+/// +/// Same numerical contract as [`yuv_420_to_rgb_row`]; the difference +/// is 4:4:4 chroma — one U / V pair per Y pixel, full-width chroma +/// planes, no chroma upsampling, no width parity constraint. See +/// `scalar::yuv_444_to_rgb_row` for the reference implementation. +/// +/// `use_simd = false` forces the scalar reference path. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgb_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgb_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgb_min = rgb_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgb_out.len() >= rgb_min, "rgb_out row too short"); + + if use_simd { + cfg_select! { + target_arch = "aarch64" => { + if neon_available() { + // SAFETY: `neon_available()` verified NEON is present. + unsafe { + arch::neon::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + // SAFETY: AVX-512BW verified. + unsafe { + arch::x86_avx512::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + // SAFETY: AVX2 verified. + unsafe { + arch::x86_avx2::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + // SAFETY: SSE4.1 verified. + unsafe { + arch::x86_sse41::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + // SAFETY: simd128 verified at compile time. 
+ unsafe { + arch::wasm_simd128::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgb_row(y, u, v, rgb_out, width, matrix, full_range); +} + +/// Converts one row of YUV 4:4:4 planar to packed **RGBA** (8-bit). +/// Same numerical contract as [`yuv_444_to_rgb_row`]; the only +/// differences are the per-pixel stride (4 vs 3) and the alpha byte +/// (`0xFF`, opaque, for every pixel). `rgba_out.len() >= 4 * width`. +/// `use_simd = false` forces scalar. +#[cfg_attr(not(tarpaulin), inline(always))] +#[allow(clippy::too_many_arguments)] +pub fn yuv_444_to_rgba_row( + y: &[u8], + u: &[u8], + v: &[u8], + rgba_out: &mut [u8], + width: usize, + matrix: ColorMatrix, + full_range: bool, + use_simd: bool, +) { + let rgba_min = rgba_row_bytes(width); + assert!(y.len() >= width, "y row too short"); + assert!(u.len() >= width, "u row too short"); + assert!(v.len() >= width, "v row too short"); + assert!(rgba_out.len() >= rgba_min, "rgba_out row too short"); + + if use_simd { + cfg_select! 
{ + target_arch = "aarch64" => { + if neon_available() { + unsafe { + arch::neon::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "x86_64" => { + if avx512_available() { + unsafe { + arch::x86_avx512::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if avx2_available() { + unsafe { + arch::x86_avx2::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + if sse41_available() { + unsafe { + arch::x86_sse41::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + target_arch = "wasm32" => { + if simd128_available() { + unsafe { + arch::wasm_simd128::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); + } + return; + } + }, + _ => {} + } + } + + scalar::yuv_444_to_rgba_row(y, u, v, rgba_out, width, matrix, full_range); +} From c1a07310f3727cac298053602090ce728b6e17b4 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:44:43 +1200 Subject: [PATCH 4/6] finish scalar impl for yuv420p --- src/row/arch/x86_avx2.rs | 3 +-- src/row/arch/x86_avx512/tests.rs | 14 ++++++++++++-- src/row/arch/x86_sse41.rs | 8 +++++++- src/row/arch/x86_sse41/tests.rs | 14 ++++++++++++-- src/row/dispatch/bayer.rs | 3 +-- src/row/dispatch/nv.rs | 11 ++++++----- src/row/dispatch/pn.rs | 14 ++++++++------ src/row/dispatch/rgb_ops.rs | 7 +++---- src/row/dispatch/yuv420/mod.rs | 4 ++-- src/row/dispatch/yuv420/p010.rs | 13 ++++++------- src/row/dispatch/yuv420/p012.rs | 13 ++++++------- src/row/dispatch/yuv420/p016.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p10.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p12.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p14.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p16.rs | 13 ++++++------- src/row/dispatch/yuv420/yuv420p9.rs | 12 ++++++------ src/row/dispatch/yuv420/yuv_420.rs | 12 ++++++------ 
src/row/dispatch/yuv444/mod.rs | 16 ++++++++-------- src/row/dispatch/yuv444/yuv444p10.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p12.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p14.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p16.rs | 13 ++++++------- src/row/dispatch/yuv444/yuv444p9.rs | 12 ++++++------ src/row/dispatch/yuv444/yuv_444.rs | 12 ++++++------ src/row/dispatch/yuva.rs | 11 ++++++----- src/row/mod.rs | 10 ++-------- 27 files changed, 158 insertions(+), 148 deletions(-) diff --git a/src/row/arch/x86_avx2.rs b/src/row/arch/x86_avx2.rs index 425609b3..e106b2c5 100644 --- a/src/row/arch/x86_avx2.rs +++ b/src/row/arch/x86_avx2.rs @@ -4029,8 +4029,7 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(a_vec), diff --git a/src/row/arch/x86_avx512/tests.rs b/src/row/arch/x86_avx512/tests.rs index b3d6af0e..a2fc56e6 100644 --- a/src/row/arch/x86_avx512/tests.rs +++ b/src/row/arch/x86_avx512/tests.rs @@ -3225,8 +3225,18 @@ fn avx512_yuva420p_n_rgba_u16_matches_scalar_widths() { return; } for w in [64usize, 66, 78, 94, 1920, 1922] { - check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>(w, ColorMatrix::Bt601, false, 89); - check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + ); + check_yuv420p_n_u16_avx512_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); } } diff --git a/src/row/arch/x86_sse41.rs b/src/row/arch/x86_sse41.rs index a8935652..a69ea18c 100644 --- a/src/row/arch/x86_sse41.rs +++ b/src/row/arch/x86_sse41.rs @@ -3555,7 +3555,13 @@ pub(crate) unsafe fn yuv_420p16_to_rgb_or_rgba_u16_row(w, ColorMatrix::Bt601, false, 89); - check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>(w, ColorMatrix::Bt709, true, 89); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<9>( + w, + ColorMatrix::Bt601, + false, + 89, + 
); + check_yuv420p_n_u16_sse41_rgba_with_alpha_src_equivalence::<10>( + w, + ColorMatrix::Bt709, + true, + 89, + ); } } diff --git a/src/row/dispatch/bayer.rs b/src/row/dispatch/bayer.rs index 4f45857f..9af7b199 100644 --- a/src/row/dispatch/bayer.rs +++ b/src/row/dispatch/bayer.rs @@ -6,8 +6,7 @@ //! parameter is wired through so callers don't have to touch their //! call sites when SIMD lands. -use crate::row::scalar; -use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems}; +use crate::row::{assert_color_transform_well_formed, rgb_row_bytes, rgb_row_elems, scalar}; /// Converts one row of an 8-bit Bayer plane to packed RGB. /// diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index b342e6e4..236b1401 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,15 +1,16 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. /// diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index f2a143c4..19df9ed8 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,15 +11,18 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, uv_full_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{ + arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, + }, +}; // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- // @@ -793,4 +796,3 @@ pub fn p416_to_rgba_u16_row( scalar::p_n_444_16_to_rgba_u16_row(y, uv_full, rgba_out, width, matrix, full_range); } - diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index c51257d8..24b3a087 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,14 +2,13 @@ //! organization. All three route through the standard //! `cfg_select!` per-arch block; `use_simd = false` forces scalar. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; +use crate::row::{arch, rgb_row_bytes, scalar}; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. 
diff --git a/src/row/dispatch/yuv420/mod.rs b/src/row/dispatch/yuv420/mod.rs index 972210e7..57727688 100644 --- a/src/row/dispatch/yuv420/mod.rs +++ b/src/row/dispatch/yuv420/mod.rs @@ -23,9 +23,9 @@ pub(super) mod yuv_420; pub use p010::*; pub use p012::*; pub use p016::*; +pub use yuv_420::*; +pub use yuv420p9::*; pub use yuv420p10::*; pub use yuv420p12::*; pub use yuv420p14::*; pub use yuv420p16::*; -pub use yuv420p9::*; -pub use yuv_420::*; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 35f9e548..67ad9c95 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,16 +1,16 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ /// packed — 10 active bits in the high 10 of each `u16`) to packed @@ -164,7 +164,6 @@ pub fn p010_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<10>(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P010** (semi-planar 4:2:0, 10-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index 618bc8f6..c3058425 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,16 +1,16 @@ //! 
P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ /// packed — 12 active bits in the high 12 of each `u16`) to packed @@ -148,7 +148,6 @@ pub fn p012_to_rgb_u16_row( scalar::p_n_to_rgb_u16_row::<12>(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P012** (semi-planar 4:2:0, 12-bit, /// high-bit-packed) to packed **8-bit** **RGBA**. Alpha defaults to /// `0xFF` (opaque). diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 128aaf17..765cf596 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,15 +1,15 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to /// packed **8-bit** RGB. At 16 bits there is no high-bit-packed @@ -140,7 +140,6 @@ pub fn p016_to_rgb_u16_row( scalar::p16_to_rgb_u16_row(y, uv_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **P016** (semi-planar 4:2:0, full 16-bit /// samples) to packed **8-bit** **RGBA**. Alpha defaults to `0xFF`. /// diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 27f7a1ff..8083d5a5 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,15 +1,15 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. 
/// @@ -195,7 +195,6 @@ pub fn yuv420p10_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<10>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **10-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 9d250c9a..761b51c0 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,15 +1,15 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. /// @@ -171,7 +171,6 @@ pub fn yuv420p12_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<12>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **12-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index a1c8024f..f9fad7af 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,15 +1,15 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. #[cfg_attr(not(tarpaulin), inline(always))] @@ -160,7 +160,6 @@ pub fn yuv420p14_to_rgb_u16_row( scalar::yuv_420p_n_to_rgb_u16_row::<14>(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **14-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`; alpha defaults to opaque since the /// source has no alpha plane). diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index 7b324e7d..b248ce95 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,15 +1,15 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. 
/// @@ -146,7 +146,6 @@ pub fn yuv420p16_to_rgb_u16_row( scalar::yuv_420p16_to_rgb_u16_row(y, u_half, v_half, rgb_out, width, matrix, full_range); } - /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index c28da34a..69cfb983 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,16 +1,16 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. /// diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 6428c008..19a89147 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,16 +2,16 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. 
-use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of 4:2:0 YUV to packed RGB. /// diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 4db06906..8d7778d9 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,16 +13,16 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, scalar}, +}; /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const /// generic over `BITS ∈ {10, 12, 14}`. 
Dispatches to the best @@ -189,9 +189,9 @@ pub(super) mod yuv444p16; pub(super) mod yuv444p9; pub(super) mod yuv_444; +pub use yuv_444::*; +pub use yuv444p9::*; pub use yuv444p10::*; pub use yuv444p12::*; pub use yuv444p14::*; pub use yuv444p16::*; -pub use yuv444p9::*; -pub use yuv_444::*; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 770f286e..118bb23d 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,18 +1,18 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 10-bit → u8 RGB. Thin wrapper over the /// crate-internal `yuv_444p_n_to_rgb_row::<10>`. #[cfg_attr(not(tarpaulin), inline(always))] @@ -46,7 +46,6 @@ pub fn yuv444p10_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<10>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **10-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 15edca7c..6c1d5787 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,18 +1,18 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. 
-use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 12-bit → u8 RGB. #[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -45,7 +45,6 @@ pub fn yuv444p12_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<12>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **12-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 50f39021..0ffa3912 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,18 +1,18 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 14-bit → u8 RGB. 
#[cfg_attr(not(tarpaulin), inline(always))] #[allow(clippy::too_many_arguments)] @@ -45,7 +45,6 @@ pub fn yuv444p14_to_rgb_u16_row( yuv_444p_n_to_rgb_u16_row::<14>(y, u, v, rgb_out, width, matrix, full_range, use_simd); } - /// Converts one row of **14-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). /// diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index adfe2c35..0352eb74 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,16 +2,16 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, +}; /// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the /// parallel 16-bit kernel family (same Q15 i32 output-range pipeline @@ -155,7 +155,6 @@ pub fn yuv444p16_to_rgb_u16_row( scalar::yuv_444p16_to_rgb_u16_row(y, u, v, rgb_out, width, matrix, full_range); } - /// Converts one row of **16-bit** YUV 4:4:4 to packed **8-bit** /// **RGBA** (`R, G, B, 0xFF`). Routes through the dedicated 16-bit /// scalar kernel (`scalar::yuv_444p16_to_rgba_row`). diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index 2cff1f05..e0f02e16 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,19 +4,19 @@ //! 
RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; - /// YUV 4:4:4 planar 9-bit → u8 RGB. Thin wrapper over the /// crate-internal `yuv_444p_n_to_rgb_row::<9>`. #[cfg_attr(not(tarpaulin), inline(always))] diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index 8f4352a2..625ab38f 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,16 +2,16 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. -use crate::row::scalar; -use crate::row::{arch, rgb_row_bytes, rgba_row_bytes}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; - +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, +}; /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches /// to the best available SIMD backend for the current target. 
diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index 90399881..ac2cef2e 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,15 +3,16 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. -use crate::row::scalar; -use crate::row::{arch, rgba_row_bytes, rgba_row_elems}; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; -#[cfg(target_arch = "x86_64")] -use crate::row::{avx2_available, avx512_available, sse41_available}; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::ColorMatrix; +#[cfg(target_arch = "x86_64")] +use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::{ + ColorMatrix, + row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, +}; // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- // diff --git a/src/row/mod.rs b/src/row/mod.rs index 97704767..299e7e25 100644 --- a/src/row/mod.rs +++ b/src/row/mod.rs @@ -41,8 +41,8 @@ //! see no API change from the split. pub(crate) mod arch; -pub(crate) mod scalar; mod dispatch; +pub(crate) mod scalar; // Re-exported only when a caller is compiled. 
The `MixedSinker` Strategy A // fan-out is the sole consumer, and it lives in `crate::sinker::mixed` which @@ -54,13 +54,7 @@ pub(crate) use scalar::expand_rgb_to_rgba_row; #[cfg(any(feature = "std", feature = "alloc"))] pub(crate) use scalar::expand_rgb_u16_to_rgba_u16_row; -pub use dispatch::bayer::*; -pub use dispatch::nv::*; -pub use dispatch::pn::*; -pub use dispatch::rgb_ops::*; -pub use dispatch::yuv420::*; -pub use dispatch::yuv444::*; -pub use dispatch::yuva::*; +pub use dispatch::{bayer::*, nv::*, pn::*, rgb_ops::*, yuv420::*, yuv444::*, yuva::*}; // `yuv_444p_n_to_rgb_u16_row` is consumed by the 32-bit overflow test // `yuv_444p_n_u16_dispatcher_rejects_width_times_3_overflow` below — From b8b5aea85352b5e3a9cb565076d0d53aa1b1d70b Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:56:33 +1200 Subject: [PATCH 5/6] fix(row/dispatch): gate `arch` import on supported SIMD targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-format split landed `use crate::row::arch;` (folded into the `row::{arch, ...}` import group) in every dispatch sub-file. On targets without a per-arch SIMD backend — i686, powerpc64, riscv64, s390x, etc. — the `cfg_select!` body falls through to the scalar path, every `arch::*` reference is gated out, and clippy's `-D warnings` flag promotes the resulting `unused_imports` to a hard error. CI fails: `miri-tb-i686`, `miri-sb-powerpc64`, `cross (i686-linux-android)`. Fix: lift `arch` out of the bundled `row::{...}` import block in each dispatch file and re-import it under `#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))]`. The three-target gate matches the set that has a SIMD backend in `crate::row::arch::*`. Tested via `RUSTFLAGS=-Dwarnings cargo check --target i686-unknown-linux-gnu --lib` (now clean) plus the host aarch64 / x86_64-freebsd / wasm32 suites still passing 629 tests. 
Touches every dispatch file that imports `arch`: bayer.rs is intentionally untouched (the Bayer dispatchers are still scalar-only and never reference `arch::*`). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/row/dispatch/nv.rs | 4 +++- src/row/dispatch/pn.rs | 4 +++- src/row/dispatch/rgb_ops.rs | 4 +++- src/row/dispatch/yuv420/p010.rs | 4 +++- src/row/dispatch/yuv420/p012.rs | 4 +++- src/row/dispatch/yuv420/p016.rs | 4 +++- src/row/dispatch/yuv420/yuv420p10.rs | 4 +++- src/row/dispatch/yuv420/yuv420p12.rs | 4 +++- src/row/dispatch/yuv420/yuv420p14.rs | 4 +++- src/row/dispatch/yuv420/yuv420p16.rs | 4 +++- src/row/dispatch/yuv420/yuv420p9.rs | 4 +++- src/row/dispatch/yuv420/yuv_420.rs | 4 +++- src/row/dispatch/yuv444/mod.rs | 4 +++- src/row/dispatch/yuv444/yuv444p10.rs | 4 +++- src/row/dispatch/yuv444/yuv444p12.rs | 4 +++- src/row/dispatch/yuv444/yuv444p14.rs | 4 +++- src/row/dispatch/yuv444/yuv444p16.rs | 4 +++- src/row/dispatch/yuv444/yuv444p9.rs | 4 +++- src/row/dispatch/yuv444/yuv_444.rs | 4 +++- src/row/dispatch/yuva.rs | 4 +++- 20 files changed, 60 insertions(+), 20 deletions(-) diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index 236b1401..cec38348 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,6 +1,8 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of NV12 (semi‑planar 4:2:0) to packed RGB. 
diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index 19df9ed8..c0c72363 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,6 +11,8 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -20,7 +22,7 @@ use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, row::{ - arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, + rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, }, }; diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 24b3a087..86ccd52e 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,11 +2,13 @@ //! organization. All three route through the standard //! `cfg_select!` per-arch block; `use_simd = false` forces scalar. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::row::{arch, rgb_row_bytes, scalar}; +use crate::row::{rgb_row_bytes, scalar}; #[cfg(target_arch = "x86_64")] use crate::row::{avx2_available, avx512_available, sse41_available}; diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 67ad9c95..46bfbb45 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,6 +1,8 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P010** (semi‑planar 4:2:0, 10‑bit, high‑bit‑ diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index c3058425..8b231627 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,6 +1,8 @@ //! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P012** (semi‑planar 4:2:0, 12‑bit, high‑bit‑ diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 765cf596..049f9d3a 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,5 +1,7 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **P016** (semi-planar 4:2:0, 16-bit) to diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 8083d5a5..3a24eacb 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,5 +1,7 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **10‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 761b51c0..5fc011fa 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,5 +1,7 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **12‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index f9fad7af..7097b181 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,5 +1,7 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **14‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index b248ce95..c5967784 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,5 +1,7 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **16-bit** YUV 4:2:0 to packed **8-bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 69cfb983..9a48e31e 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,6 +1,8 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -9,7 +11,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// Converts one row of **9‑bit** YUV 4:2:0 to packed **8‑bit** RGB. diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 19a89147..7aafbdf7 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,6 +2,8 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of 4:2:0 YUV to packed RGB. diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index 8d7778d9..fe27cc55 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,6 +13,8 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -21,7 +23,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, scalar}, }; /// YUV 4:4:4 planar 10/12/14-bit → **u8** RGB dispatcher. Const diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 118bb23d..21a8f1c9 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,5 +1,7 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 6c1d5787..7ecc7066 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,5 +1,7 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 0ffa3912..8d6ea884 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,5 +1,7 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -8,7 +10,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index 0352eb74..bed355b7 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,6 +2,8 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar}, }; /// YUV 4:4:4 planar **16-bit** → packed **u8** RGB. Uses the diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index e0f02e16..d4ff1b87 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,6 +4,8 @@ //! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -12,7 +14,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; use super::{yuv_444p_n_to_rgb_row, yuv_444p_n_to_rgb_u16_row}; diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index 625ab38f..b4cc6298 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,6 +2,8 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. +#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -10,7 +12,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgb_row_bytes, rgba_row_bytes, scalar}, + row::{rgb_row_bytes, rgba_row_bytes, scalar}, }; /// Converts one row of YUV 4:4:4 planar to packed RGB. Dispatches diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index ac2cef2e..06bf96d0 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,6 +3,8 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. 
+#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] @@ -11,7 +13,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{arch, rgba_row_bytes, rgba_row_elems, scalar}, + row::{rgba_row_bytes, rgba_row_elems, scalar}, }; // ---- YUVA 4:4:4 RGBA dispatchers -------------------------------------- From 56be621efb99f5610025613c85668e0c47779246 Mon Sep 17 00:00:00 2001 From: uqio <276879906+uqio@users.noreply.github.com> Date: Tue, 28 Apr 2026 12:57:28 +1200 Subject: [PATCH 6/6] finish scalar impl for yuv420p --- .github/workflows/ci.yml | 2 +- src/row/dispatch/nv.rs | 6 +++++- src/row/dispatch/pn.rs | 10 ++++++---- src/row/dispatch/rgb_ops.rs | 8 ++++++-- src/row/dispatch/yuv420/p010.rs | 6 +++++- src/row/dispatch/yuv420/p012.rs | 6 +++++- src/row/dispatch/yuv420/p016.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p10.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p12.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p14.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p16.rs | 6 +++++- src/row/dispatch/yuv420/yuv420p9.rs | 6 +++++- src/row/dispatch/yuv420/yuv_420.rs | 6 +++++- src/row/dispatch/yuv444/mod.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p10.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p12.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p14.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p16.rs | 6 +++++- src/row/dispatch/yuv444/yuv444p9.rs | 6 +++++- src/row/dispatch/yuv444/yuv_444.rs | 6 +++++- src/row/dispatch/yuva.rs | 6 +++++- 21 files changed, 103 insertions(+), 25 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fdf5548c..0b77ea04 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -186,7 +186,7 @@ jobs: - name: Install Rust run: rustup update stable --no-self-update && rustup default 
stable - name: Install Intel SDE - uses: petarpetrovt/setup-sde@v3.0 + uses: petarpetrovt/setup-sde@v4.0 with: sdeVersion: 9.33.0 environmentVariableName: SDE_PATH diff --git a/src/row/dispatch/nv.rs b/src/row/dispatch/nv.rs index cec38348..2ec2f153 100644 --- a/src/row/dispatch/nv.rs +++ b/src/row/dispatch/nv.rs @@ -1,7 +1,11 @@ //! NV-family dispatchers (NV12 / NV21 / NV24 / NV42, both RGB and //! RGBA outputs) extracted from `row::mod` for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/pn.rs b/src/row/dispatch/pn.rs index c0c72363..534bfd64 100644 --- a/src/row/dispatch/pn.rs +++ b/src/row/dispatch/pn.rs @@ -11,7 +11,11 @@ //! since they share the 4:2:0 chroma layout with the planar //! yuv420p9/10/12/14/16 family. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; @@ -21,9 +25,7 @@ use crate::row::simd128_available; use crate::row::{avx2_available, avx512_available, sse41_available}; use crate::{ ColorMatrix, - row::{ - rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems, - }, + row::{rgb_row_bytes, rgb_row_elems, rgba_row_bytes, rgba_row_elems, scalar, uv_full_row_elems}, }; // ---- Pn semi-planar 4:4:4 (P410 / P412 / P416) → RGB -------------------- diff --git a/src/row/dispatch/rgb_ops.rs b/src/row/dispatch/rgb_ops.rs index 86ccd52e..0c98bc35 100644 --- a/src/row/dispatch/rgb_ops.rs +++ b/src/row/dispatch/rgb_ops.rs @@ -2,15 +2,19 @@ //! organization. All three route through the standard //! 
`cfg_select!` per-arch block; `use_simd = false` forces scalar. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; #[cfg(target_arch = "wasm32")] use crate::row::simd128_available; -use crate::row::{rgb_row_bytes, scalar}; #[cfg(target_arch = "x86_64")] use crate::row::{avx2_available, avx512_available, sse41_available}; +use crate::row::{rgb_row_bytes, scalar}; /// Converts one row of packed RGB to planar HSV (OpenCV 8‑bit /// encoding). See `scalar::rgb_to_hsv_row` for semantics. diff --git a/src/row/dispatch/yuv420/p010.rs b/src/row/dispatch/yuv420/p010.rs index 46bfbb45..ba9d95b8 100644 --- a/src/row/dispatch/yuv420/p010.rs +++ b/src/row/dispatch/yuv420/p010.rs @@ -1,7 +1,11 @@ //! P010 (semi-planar 4:2:0, 10-bit high-packed) dispatchers — 4 //! variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/p012.rs b/src/row/dispatch/yuv420/p012.rs index 8b231627..ef1c1301 100644 --- a/src/row/dispatch/yuv420/p012.rs +++ b/src/row/dispatch/yuv420/p012.rs @@ -1,7 +1,11 @@ //! P012 (semi-planar 4:2:0, 12-bit high-packed) dispatchers — 4 //! variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/p016.rs b/src/row/dispatch/yuv420/p016.rs index 049f9d3a..abdf59d1 100644 --- a/src/row/dispatch/yuv420/p016.rs +++ b/src/row/dispatch/yuv420/p016.rs @@ -1,6 +1,10 @@ //! P016 (semi-planar 4:2:0, 16-bit) dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p10.rs b/src/row/dispatch/yuv420/yuv420p10.rs index 3a24eacb..349d0623 100644 --- a/src/row/dispatch/yuv420/yuv420p10.rs +++ b/src/row/dispatch/yuv420/yuv420p10.rs @@ -1,6 +1,10 @@ //! 10-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p12.rs b/src/row/dispatch/yuv420/yuv420p12.rs index 5fc011fa..3b503b74 100644 --- a/src/row/dispatch/yuv420/yuv420p12.rs +++ b/src/row/dispatch/yuv420/yuv420p12.rs @@ -1,6 +1,10 @@ //! 12-bit planar YUV 4:2:0 dispatchers — 4 variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p14.rs b/src/row/dispatch/yuv420/yuv420p14.rs index 7097b181..50427e59 100644 --- a/src/row/dispatch/yuv420/yuv420p14.rs +++ b/src/row/dispatch/yuv420/yuv420p14.rs @@ -1,6 +1,10 @@ //! 14-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p16.rs b/src/row/dispatch/yuv420/yuv420p16.rs index c5967784..c681c48b 100644 --- a/src/row/dispatch/yuv420/yuv420p16.rs +++ b/src/row/dispatch/yuv420/yuv420p16.rs @@ -1,6 +1,10 @@ //! 16-bit planar YUV 4:2:0 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv420p9.rs b/src/row/dispatch/yuv420/yuv420p9.rs index 9a48e31e..09cb0156 100644 --- a/src/row/dispatch/yuv420/yuv420p9.rs +++ b/src/row/dispatch/yuv420/yuv420p9.rs @@ -1,7 +1,11 @@ //! 9-bit planar YUV 4:2:0 dispatchers — 4 variants (RGB, RGB-u16, //! RGBA, RGBA-u16). 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv420/yuv_420.rs b/src/row/dispatch/yuv420/yuv_420.rs index 7aafbdf7..80bdea23 100644 --- a/src/row/dispatch/yuv420/yuv_420.rs +++ b/src/row/dispatch/yuv420/yuv_420.rs @@ -2,7 +2,11 @@ //! `yuv_420_to_rgba_row`). Extracted from the parent `dispatch::yuv420` //! module per source format for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/mod.rs b/src/row/dispatch/yuv444/mod.rs index fe27cc55..01ca3861 100644 --- a/src/row/dispatch/yuv444/mod.rs +++ b/src/row/dispatch/yuv444/mod.rs @@ -13,7 +13,11 @@ //! RGB wrappers above. They stay `pub(crate)` and live here at the //! `yuv444` module root so siblings can reach them via `super::*`. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p10.rs b/src/row/dispatch/yuv444/yuv444p10.rs index 21a8f1c9..b6836e8e 100644 --- a/src/row/dispatch/yuv444/yuv444p10.rs +++ b/src/row/dispatch/yuv444/yuv444p10.rs @@ -1,6 +1,10 @@ //! 10-bit planar YUV 4:4:4 dispatchers — 4 variants. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p12.rs b/src/row/dispatch/yuv444/yuv444p12.rs index 7ecc7066..c4f3e0f4 100644 --- a/src/row/dispatch/yuv444/yuv444p12.rs +++ b/src/row/dispatch/yuv444/yuv444p12.rs @@ -1,6 +1,10 @@ //! 12-bit planar YUV 4:4:4 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p14.rs b/src/row/dispatch/yuv444/yuv444p14.rs index 8d6ea884..8b7b7e7b 100644 --- a/src/row/dispatch/yuv444/yuv444p14.rs +++ b/src/row/dispatch/yuv444/yuv444p14.rs @@ -1,6 +1,10 @@ //! 14-bit planar YUV 4:4:4 dispatchers — 4 variants. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p16.rs b/src/row/dispatch/yuv444/yuv444p16.rs index bed355b7..87d69fc9 100644 --- a/src/row/dispatch/yuv444/yuv444p16.rs +++ b/src/row/dispatch/yuv444/yuv444p16.rs @@ -2,7 +2,11 @@ //! helpers in `super::*` are pinned to {9,10,12,14}, so 16-bit gets //! its own dedicated dispatchers (i64 chroma at native u16 output). 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv444p9.rs b/src/row/dispatch/yuv444/yuv444p9.rs index d4ff1b87..784ed036 100644 --- a/src/row/dispatch/yuv444/yuv444p9.rs +++ b/src/row/dispatch/yuv444/yuv444p9.rs @@ -4,7 +4,11 @@ //! RGBA / RGBA-u16 paths are full dispatchers (the BITS-generic //! template doesn't apply for the alpha-fill case). -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuv444/yuv_444.rs b/src/row/dispatch/yuv444/yuv_444.rs index b4cc6298..25174964 100644 --- a/src/row/dispatch/yuv444/yuv_444.rs +++ b/src/row/dispatch/yuv444/yuv_444.rs @@ -2,7 +2,11 @@ //! `yuv_444_to_rgba_row`). Extracted from the parent //! `dispatch::yuv444` module per source format for organization. -#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available; diff --git a/src/row/dispatch/yuva.rs b/src/row/dispatch/yuva.rs index 06bf96d0..34477473 100644 --- a/src/row/dispatch/yuva.rs +++ b/src/row/dispatch/yuva.rs @@ -3,7 +3,11 @@ //! RGBA and native-depth `u16` RGBA outputs. Extracted from //! `row::mod` for organization. 
-#[cfg(any(target_arch = "aarch64", target_arch = "x86_64", target_arch = "wasm32"))] +#[cfg(any( + target_arch = "aarch64", + target_arch = "x86_64", + target_arch = "wasm32" +))] use crate::row::arch; #[cfg(target_arch = "aarch64")] use crate::row::neon_available;